# experiment for paper

In [1]:
import os
import json
from collections import defaultdict
import subprocess


# Build Tool Database

In [None]:
# parameter_dict maps a bracketed parameter path (e.g. "[name]") to a set of
# (api_doc, example_value) tuples; it is filled by the loop further down.
parameter_dict = defaultdict(set)
# One entry per API below ../extractor/apidocs.
folders = os.listdir("../extractor/apidocs")
def add_to_dict(d: defaultdict, prev_path: str, object, api_doc: str):
    """Flatten a JSON-like tree into ``d``, keyed by root-to-leaf path.

    Every dict key on the way down appends ``[key]`` to the path; list
    elements share the path of the list itself.  Each truthy leaf is stored
    as an ``(api_doc, leaf)`` tuple in the set ``d[path]``.  Falsy leaves
    (``None``, ``0``, ``""``, ``False``) are dropped, and a set that already
    holds 10 entries is not extended any further.
    """
    if isinstance(object, dict):
        for child_key, child in object.items():
            child_path = "".join((prev_path, "[", child_key, "]"))
            add_to_dict(d, child_path, child, api_doc)
        return
    if isinstance(object, list):
        for element in object:
            add_to_dict(d, prev_path, element, api_doc)
        return
    if not object:
        # Empty leaf: nothing worth recording (and no key is created in d).
        return
    bucket = d[prev_path]
    if len(bucket) < 10:
        bucket.add((api_doc, object))
def build_dict(d, object, api_doc):
    """Flatten the whole tree ``object`` into ``d`` starting from the empty root path."""
    root_path = ''
    add_to_dict(d, root_path, object, api_doc)
# Fill parameter_dict with the documented example value of every required and
# optional parameter of every endpoint found in <folder>/<folder>.txt.
for folder in folders:
    path = "../extractor/apidocs/" + folder + "/" + folder + ".txt"
    try:
        # The context manager closes the file even when parsing fails.
        with open(path) as doc_file:
            json_file = json.load(doc_file)
    except (OSError, ValueError):
        # No readable, parsable extraction result for this folder: skip it.
        # (json.JSONDecodeError and UnicodeDecodeError are ValueErrors.)
        continue
    for endpoint in json_file['endpoints']:
        # Both parameter groups are handled identically; a null/empty group
        # simply contributes nothing.
        for group in ('required_parameters', 'optional_parameters'):
            for param in endpoint[group] or []:
                name = param['name']
                example = param['example'] or None
                add_to_dict(parameter_dict, "["+name+"]", example, folder)
parameter_dict


In [None]:
folders = os.listdir("../extractor/apidocs")
description_to_param_dict = {} # map description to parameter
param_to_description_dict = {} # map parameter to description
for folder in folders:
    path = "../extractor/apidocs/" + folder + "/" + folder + ".txt"
    try:
        # The context manager closes the file even when parsing fails.
        with open(path) as doc_file:
            json_file = json.load(doc_file)
    except (OSError, ValueError):
        # No readable, parsable extraction result for this folder: skip it.
        continue
    for endpoint in json_file['endpoints']:
        # Required and optional parameters are recorded the same way; a
        # null/empty group contributes nothing.
        for group in ('required_parameters', 'optional_parameters'):
            for param in endpoint[group] or []:
                name = param['name']
                description = param['description']
                # Later entries overwrite earlier ones that share the same
                # description or the same parameter name.
                description_to_param_dict[description] = name
                param_to_description_dict[name] = description

description_to_param_dict

# Get API response

In [None]:
# no need to run again
# Run every generated endpoint script once and, when it prints a JSON
# document on stdout, store that document next to the script as
# <script name>_response.json.
apidocs_dir = "../extractor/apidocs"
count = 0  # number of scripts executed
folders = os.listdir(apidocs_dir)
for folder in folders:
    files = os.listdir(os.path.join(apidocs_dir, folder))
    # Scripts of this API that exited with an error or printed non-JSON output.
    failed_endpoints = []
    files = [x for x in files if x.endswith(".py")]
    for file_name in files:
        count += 1
        file_path = os.path.join(apidocs_dir, folder, file_name)
        try:
            result = subprocess.run(
                ["python", file_path], capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError:
            # Non-zero exit status: nothing to save for this script.
            failed_endpoints.append(file_name)
        else:
            if result.stdout:
                try:
                    output_json = json.loads(result.stdout)
                except ValueError:
                    # Output is not JSON; treat it like a failed run instead
                    # of aborting the whole batch.
                    failed_endpoints.append(file_name)
                else:
                    # save in file
                    with open(file_path[:-3] + '_response.json', "w") as f:
                        json.dump(output_json, f)
        print(f"Tested: {file_name}")


In [None]:
# Build response_dict: flatten the JSON body of every saved response whose
# recorded status code is 200 into path -> {(api folder, value)} sets.
apidocs_dir = "../extractor/apidocs"
folders = os.listdir(apidocs_dir)
response_dict = defaultdict(set)
for folder in folders:
    files = os.listdir(os.path.join(apidocs_dir, folder))
    files = [x for x in files if x.endswith("_response.json")]
    for file_name in files:
        file_path = os.path.join(apidocs_dir, folder, file_name)
        # The context manager closes the file once it has been parsed.
        with open(file_path) as response_file:
            response = json.load(response_file)
        if response['status_code'] == 200:
            build_dict(response_dict, response['json'], folder)
response_dict



# Calculate embedding, build parameter knowledge base

In [None]:
from charset_normalizer import md__mypyc
from sentence_transformers import SentenceTransformer, util
# Embed the keys of the three lookup tables once, so that later queries only
# have to embed the query text itself.
model = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")

# Parameter paths collected from the API documentation.
param_keys = list(parameter_dict)
param_keys_emb = model.encode(param_keys, convert_to_tensor=True)

# Paths collected from the saved API responses.
response_keys = list(response_dict)
response_keys_emb = model.encode(response_keys, convert_to_tensor=True)

# Parameter descriptions.
description_keys = list(description_to_param_dict)
description_keys_emb = model.encode(description_keys, convert_to_tensor=True)


In [None]:
# Example lookup: search all three tables for one parameter name.
query = 'iupacextended'
query_emb = model.encode(query, convert_to_tensor=True)
# The parameter name itself is matched against parameter and response paths.
hits_param, hits_response = (
    util.semantic_search(query_emb, corpus_emb)
    for corpus_emb in (param_keys_emb, response_keys_emb)
)
# The parameter's description is matched against the known descriptions.
query_description = param_to_description_dict[query]
query_description_emb = model.encode(query_description, convert_to_tensor=True)
hits_description = util.semantic_search(query_description_emb, description_keys_emb)

In [None]:
# Print the hits of the sample query, one section per lookup table.
for hit in hits_param[0]:
    matched_key = param_keys[hit['corpus_id']]
    print(f"Param: {matched_key} Example: {parameter_dict[matched_key]}")
    print(f"Similarity: {hit['score']}")
    print("")

print("==================================")
for hit in hits_response[0]:
    matched_key = response_keys[hit['corpus_id']]
    print(f"Request: {matched_key} Example: {response_dict[matched_key]}")
    print(f"Similarity: {hit['score']}")
    print("")

print("==================================")
for hit in hits_description[0]:
    matched_description = description_keys[hit['corpus_id']]
    # A description leads to its parameter name, which is then looked up
    # under its bracketed path.
    matched_param = description_to_param_dict[matched_description]
    print(f"Description: {matched_description} Example: {parameter_dict['['+matched_param+']']}")
    print(f"Similarity: {hit['score']}")
    print("")

# load generated tools

In [7]:
import sys
def load_and_import(api_name, function_name):
    """Import the generated module ``function_name`` of API ``api_name``.

    The module is looked up in ``../extractor/apidocs/<api_name>`` (relative
    to the current working directory), which is put at the front of
    ``sys.path`` only for the duration of the import.

    Args:
        api_name: name of the API folder below ``../extractor/apidocs``.
        function_name: module name, i.e. the script's file name without ``.py``.

    Returns:
        The imported module object.

    Raises:
        ImportError: if the module cannot be found or imported.
    """
    folder_path = os.path.join("..", "extractor", "apidocs", api_name)

    # Imports are cached by bare module name.  If another API already loaded
    # a module with this name, drop that entry so this API's own file is
    # imported instead of silently returning the other API's module.
    cached_file = getattr(sys.modules.get(function_name), "__file__", None)
    if (
        cached_file
        and os.path.isfile(os.path.join(folder_path, function_name + ".py"))
        and os.path.dirname(os.path.abspath(cached_file)) != os.path.abspath(folder_path)
    ):
        del sys.modules[function_name]

    sys.path.insert(0, folder_path)
    try:
        return __import__(function_name)
    finally:
        # Clean up even when the import raises; remove by value because the
        # imported module may itself have changed sys.path.
        try:
            sys.path.remove(folder_path)
        except ValueError:
            pass

# Load one generated tool module as a smoke test.
module = load_and_import(
    api_name="glycanformatconverter",
    function_name="convert_glycoct_to_wurcs_GET",
)

In [None]:
# Call the loaded tool once with a GlycoCT structure as input.
f = module.convert_glycoct_to_wurcs
glycoct_input = '''RES\n1b:x-dglc-HEX-1:5\n2s:n-acetyl\n3b:b-dglc-HEX-1:5\n4s:n-acetyl\n5b:b-dman-HEX-1:5\n6b:a-dman-HEX-1:5\n7b:a-dman-HEX-1:5\nLIN\n1:1d(2+1)2n\n2:1o(4+1)3d\n3:3d(2+1)4n\n4:3o(4+1)5d\n5:5o(3+1)6d\n6:5o(6+1)7d'''
response = f(glycoct=glycoct_input)


# query KB and get parameter value candidates

In [34]:
def get_function(api_name, endpoint_name):
    """Return the callable that implements ``endpoint_name`` of API ``api_name``.

    The module is named after the endpoint (including its ``_GET`` / ``_POST``
    tag); the function inside it carries the same name with every ``_GET``
    and ``_POST`` occurrence removed.
    """
    function_name = endpoint_name
    for method_tag in ("_GET", "_POST"):
        function_name = function_name.replace(method_tag, "")
    endpoint_module = load_and_import(api_name, endpoint_name)
    return getattr(endpoint_module, function_name)

import random
def _collect_hit_examples(hit_list, resolve_hit, api_doc_name, max_examples, max_examples_per_hit, similarity_threshold):
    """Sample (parameter name, value) pairs from the hits of one search.

    Args:
        hit_list: hits for a single query, each a dict with ``'corpus_id'``
            and ``'score'``.  Assumed to be ordered by decreasing score (as
            ``util.semantic_search`` is documented to return them), which is
            why the scan stops at the first hit below the threshold.
        resolve_hit: maps a corpus id to ``(test_param, examples_set)`` where
            ``examples_set`` holds ``(api_doc, value)`` tuples.
        api_doc_name: examples recorded for this API are excluded.
        max_examples: stop once this many examples have been sampled.
        max_examples_per_hit: upper bound of examples sampled per hit.
        similarity_threshold: minimum score a hit needs to be used.

    Returns:
        A set of ``(test_param, value)`` tuples.
    """
    pairs = set()
    count = 0
    for hit in hit_list:
        # ">=" so that at most max_examples are sampled (the limit used to be
        # exceeded by one hit's worth of examples).
        if hit['score'] < similarity_threshold or count >= max_examples:
            break
        test_param, examples_set = resolve_hit(hit['corpus_id'])
        candidates = [value for doc, value in examples_set if doc != api_doc_name]
        chosen = random.sample(candidates, min(len(candidates), max_examples_per_hit))
        pairs.update((test_param, value) for value in chosen)
        count += len(chosen)
    return pairs


def get_test_examples(query, api_doc_name, max_param_examples=5, max_response_examples=5, max_examples_per_hit=1, similarity_threshold=0.5, query_description=None):
    """Collect candidate values for the parameter named ``query``.

    Three semantic searches are combined: the parameter name against the
    documented parameter paths, the parameter name against the response
    paths, and (when a description is available) the parameter description
    against the known parameter descriptions.

    Args:
        query: parameter name to find candidate values for.
        api_doc_name: API the parameter belongs to; values recorded for this
            API are never returned.
        max_param_examples: limit for examples taken from parameter hits
            (also used for description hits).
        max_response_examples: limit for examples taken from response hits.
        max_examples_per_hit: examples randomly drawn per matching key.
        similarity_threshold: minimum similarity score of a usable hit.
        query_description: description of the parameter; looked up in
            ``param_to_description_dict`` when not given.

    Returns:
        A set of ``(test_param, value)`` tuples, where ``test_param`` is the
        name of the knowledge-base entry the value was taken from.
    """
    if not query_description:
        query_description = param_to_description_dict.get(query)
    query_emb = model.encode(query, convert_to_tensor=True)
    hits_param = util.semantic_search(query_emb, param_keys_emb)
    hits_response = util.semantic_search(query_emb, response_keys_emb)
    hits_description = None
    if query_description:
        query_description_emb = model.encode(query_description, convert_to_tensor=True)
        hits_description = util.semantic_search(query_description_emb, description_keys_emb)

    def from_param_key(corpus_id):
        # The parameter name is the content of the last "[...]" of the path.
        key = param_keys[corpus_id]
        return key[key.rfind('[') + 1:-1], parameter_dict[key]

    def from_response_key(corpus_id):
        key = response_keys[corpus_id]
        return key[key.rfind('[') + 1:-1], response_dict[key]

    def from_description(corpus_id):
        # A description leads to its parameter name, whose documented
        # examples are stored under the bracketed name.
        param_name = description_to_param_dict[description_keys[corpus_id]]
        return param_name, parameter_dict['[' + param_name + ']']

    all_examples = set()
    all_examples |= _collect_hit_examples(
        hits_param[0], from_param_key, api_doc_name,
        max_param_examples, max_examples_per_hit, similarity_threshold)
    all_examples |= _collect_hit_examples(
        hits_response[0], from_response_key, api_doc_name,
        max_response_examples, max_examples_per_hit, similarity_threshold)
    if hits_description is not None:
        all_examples |= _collect_hit_examples(
            hits_description[0], from_description, api_doc_name,
            max_param_examples, max_examples_per_hit, similarity_threshold)
    return all_examples

    

In [None]:
# Example: candidate values for one parameter of the glycanformatconverter API.
get_test_examples(
    "iupacextended",
    "glycanformatconverter",
    max_param_examples=5,
    max_response_examples=5,
    max_examples_per_hit=1,
    similarity_threshold=0.5,
)

In [36]:
import inspect
import time
import itertools
def get_parameter_names(func):
    """Return the names of all parameters of ``func`` in declaration order."""
    # The signature's parameter mapping is ordered and keyed by name.
    return list(inspect.signature(func).parameters)
def _load_json_file(path):
    """Parse the JSON document stored at ``path``; the file is closed afterwards."""
    with open(path) as handle:
        return json.load(handle)


def _describe_response(response):
    """Return status code, decoded JSON body ("" if it cannot be decoded) and raw text of ``response``."""
    try:
        body = response.json()
    except Exception:
        # Body is not JSON (or cannot be decoded): keep only the raw text.
        body = ""
    return {'status_code': response.status_code, 'json': body, 'text': response.text}


def _combination_at(pools, index):
    """Return element number ``index`` of the cartesian product of ``pools`` (product order)."""
    picked = []
    for pool in reversed(pools):
        index, position = divmod(index, len(pool))
        picked.append(pool[position])
    return tuple(reversed(picked))


def _sample_combinations(pools, max_samples):
    """Draw up to ``max_samples`` distinct combinations from the product of ``pools``.

    The product is never materialised, so memory stays bounded by
    ``max_samples`` even when the number of combinations is huge.
    """
    total = 1
    for pool in pools:
        total *= len(pool)
    if total <= max_samples:
        # Few enough combinations: use all of them, in random order.
        indices = random.sample(range(total), total)
    else:
        indices, seen = [], set()
        while len(indices) < max_samples:
            candidate = random.randrange(total)
            if candidate not in seen:
                seen.add(candidate)
                indices.append(candidate)
    return [_combination_at(pools, index) for index in indices]


def run_experiment(MAX_SAMPLES=100):
    """Call every generated endpoint function with knowledge-base candidates.

    For each ``<endpoint>.py`` below ``../extractor/apidocs/<api>/`` the
    endpoint function is called with candidate values obtained from
    ``get_test_examples`` (which never returns values recorded for the same
    API, simulating that no example is documented).  The outcome of every
    call is written to ``<endpoint>_example_test.json`` in the same folder;
    an existing record is loaded first and its ``tests`` list is replaced.

    Functions with exactly one parameter are called once per candidate.
    Other functions are called with up to ``MAX_SAMPLES`` randomly chosen
    combinations of the per-parameter candidates.

    Args:
        MAX_SAMPLES: maximum number of candidate combinations tried per
            endpoint that does not have exactly one parameter.
    """
    apidocs_dir = "../extractor/apidocs"
    for folder in os.listdir(apidocs_dir):
        folder_path = os.path.join(apidocs_dir, folder)
        files = [x for x in os.listdir(folder_path) if x.endswith(".py")]
        try:
            # Only APIs with a parsable extraction result that lists
            # endpoints are tested.
            _load_json_file(os.path.join(folder_path, folder + ".txt"))['endpoints']
        except (OSError, ValueError, KeyError, TypeError):
            print("No extracted information for api: ", folder)
            continue
        for file_name in files:
            if file_name == "__init__.py":
                continue
            endpoint_name = file_name[:-len(".py")]
            record_path = os.path.join(folder_path, endpoint_name + "_example_test.json")
            if os.path.exists(record_path):
                endpoint_result = _load_json_file(record_path)
            else:
                # 'extracted_parameters' stays empty: matching endpoints to
                # their documented parameters is not done here.
                endpoint_result = {"endpoint": endpoint_name, "tests": [], "extracted_parameters": {}, "validated_parameters": {}}
                print(f"Testing {folder}.{endpoint_name}")

            func = get_function(folder, endpoint_name)
            param_names = get_parameter_names(func)
            endpoint_result['tests'] = []
            if len(param_names) == 1:
                # for a single parameter function, we can try a few test cases.
                curr_param = param_names[0]
                examples = get_test_examples(curr_param, folder, max_param_examples=5, max_response_examples=5, max_examples_per_hit=1, similarity_threshold=0.5)
                for test_param, example in examples:
                    try:
                        response = func(example)
                    except Exception:
                        print(f'Error occured when calling {folder}.{endpoint_name} with {curr_param}={example}')
                        continue
                    endpoint_result['tests'].append({
                        'gt_param': curr_param,
                        'test_param': test_param,
                        'candidate': example,
                        **_describe_response(response),
                    })
                    time.sleep(0.1)  # be gentle with the remote API
            else:
                # use a rough estimation: try random combinations of the
                # candidates found for each parameter
                pools = [
                    list(get_test_examples(param, folder, max_param_examples=5, max_response_examples=5, max_examples_per_hit=1, similarity_threshold=0.5))
                    for param in param_names
                ]
                samples = _sample_combinations(pools, MAX_SAMPLES)
                print(f"Testing {folder}.{endpoint_name} with {len(samples)} samples")
                for sample in samples:
                    # Each element of sample is a (test_param, value) pair.
                    input_dict = {name: pair[1] for name, pair in zip(param_names, sample)}
                    try:
                        response = func(**input_dict)
                    except Exception:
                        print(f'Error occured when calling {folder}.{endpoint_name} with {input_dict}')
                        continue
                    endpoint_result['tests'].append({
                        'gt_param': param_names,
                        'test_param': [pair[0] for pair in sample],
                        'candidate': [pair[1] for pair in sample],
                        **_describe_response(response),
                    })
                    time.sleep(0.1)  # be gentle with the remote API
            # save endpoint_result
            with open(record_path, "w") as record_file:
                json.dump(endpoint_result, record_file)



In [None]:
# Run the whole experiment with the default sample budget per endpoint.
run_experiment(MAX_SAMPLES=100)

# let gpt evaluate the tested examples
1. check if status code is 200
2. chunk the response
3. send the response to gpt and let it check whether the API returned an actual result rather than an error message

In [38]:
from langchain.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# Prompt with two placeholders: {description} (what the API does) and
# {response} (the text the API returned).
_TAGGING_TEMPLATE = (
    "\n"
    "Decide if the following API response is an information or an error message.\n"
    "\n"
    "API Description:\n"
    "{description}\n"
    "API Response:\n"
    "{response}\n"
)
tagging_prompt = ChatPromptTemplate.from_template(_TAGGING_TEMPLATE)
# Structured output requested from the model: a single required field whose
# allowed values are listed in `enum`.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - a docstring may become part of the schema sent to the model;
# confirm before adding one.
class Classification(BaseModel):
    # 'information' or 'error'
    response_type: str= Field(..., enum=['information', 'error'])
# gpt-4o-mini at temperature 0, constrained to answer with a Classification.
_chat_model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
llm = _chat_model.with_structured_output(Classification)

def gpt_evaluate(API_description:str, API_response: str):
    """Ask the model whether ``API_response`` is real information or an error message.

    Both arguments are inserted into ``tagging_prompt``; the value of the
    ``response_type`` field of the structured answer is returned.
    """
    filled_prompt = tagging_prompt.invoke(
        {"description": API_description, "response": API_response}
    )
    classification = llm.invoke(filled_prompt)
    return classification.response_type

In [None]:
import os
import json
# Validate the recorded test calls: for every *_example_test.json record, find
# the first call that returned status 200 and whose response gpt classifies
# as information, store its candidate values as 'validated_parameters' and
# write the record back.
apidocs_dir = "../extractor/apidocs"
folders = os.listdir(apidocs_dir)
count_single_param_files = 0
count_multi_param_files = 0
count_validated_files = 0
count_validated_single_param_files = 0
count_validated_multi_param_files = 0
count_find_by_scemantic = 0
count_find_by_scemantic_single = 0
count_find_by_scemantic_multi = 0
scemantic_case = []
for folder in folders:
    files = os.listdir(os.path.join(apidocs_dir, folder))
    files = [x for x in files if x.endswith("_example_test.json")]
    for file_name in files:
        file_path = os.path.join(apidocs_dir, folder, file_name)
        # if json file is larger than 100MB, skip it
        if os.path.getsize(file_path) > 100*1024*1024:
            continue
        # The context manager closes the record before it is rewritten below.
        with open(file_path) as record_file:
            endpoint_result = json.load(record_file)
        endpoint_result['validated_parameters'] = {}
        print(f"Validating {folder}.{endpoint_result['endpoint']}")
        saved_error_responses = set() # API responses usually have similar error messages, save them to avoid repeated evaluation
        if endpoint_result['tests']:
            # 'gt_param' is a single name for one-parameter endpoints and a
            # list of names otherwise.
            if isinstance(endpoint_result['tests'][0]['gt_param'], str):
                count_single_param_files += 1
            else:
                count_multi_param_files += 1
        for test in endpoint_result['tests']:
            if test['status_code'] != 200:
                continue
            # truncate the response text to avoid the max token limit of gpt
            response_text = test['text'][:500]
            if response_text in saved_error_responses:
                continue
            gpt_response = gpt_evaluate(endpoint_result['endpoint'],response_text)
            if gpt_response == 'error':
                print(f"Error response detected: {response_text}")
                saved_error_responses.add(response_text)
                continue
            print(f"Information response detected: {response_text}")
            count_validated_files += 1
            if isinstance(test['gt_param'], str):
                endpoint_result['validated_parameters'][test['gt_param']] = test['candidate']
                count_validated_single_param_files += 1
                # The candidate came from a knowledge-base entry that is not
                # among the endpoint's extracted parameters.
                if test['test_param'] not in endpoint_result['extracted_parameters']:
                    count_find_by_scemantic += 1
                    count_find_by_scemantic_single += 1
                    scemantic_case.append((folder, endpoint_result['endpoint'], test['test_param'], test['gt_param']))
            else:
                count_validated_multi_param_files += 1
                for i, gt_param in enumerate(test['gt_param']):
                    endpoint_result['validated_parameters'][gt_param] = test['candidate'][i]
                    if test['test_param'][i] not in endpoint_result['extracted_parameters']:
                        count_find_by_scemantic += 1
                        count_find_by_scemantic_multi += 1
                        scemantic_case.append((folder, endpoint_result['endpoint'], test['test_param'][i], gt_param))
            # One successful call is enough to validate the endpoint.
            break
        with open(file_path, "w") as f:
            json.dump(endpoint_result, f)
print(f"Validated {count_validated_files} parameters in {count_multi_param_files+count_single_param_files} files. {count_find_by_scemantic} parameters are found by semantic search.")
print(f"Validated {count_validated_single_param_files} single parameters in {count_single_param_files} files. {count_find_by_scemantic_single} single parameters are found by semantic search.")
print(f"Validated {count_validated_multi_param_files} multi parameters in {count_multi_param_files} files. {count_find_by_scemantic_multi} multi parameters are found by semantic search.")
print(scemantic_case)

            