# experiment for paper

In [1]:
import os
import json
from collections import defaultdict
import subprocess


# Build Tool Database

In [2]:
# Maps a bracketed parameter path (e.g. "[name][sub]") to a set of
# (api_doc, example_value) tuples; filled in by the loop further down.
parameter_dict = defaultdict(set)
# Every entry of the apidocs directory is treated as one API's folder name.
folders = os.listdir("../extractor/apidocs")
def add_to_dict(d: defaultdict, prev_path: str, object, api_doc: str):
    '''Flatten a JSON-like tree into ``d`` keyed by root-to-leaf path.

    Dict keys extend the path as ``prev_path + "[key]"``; list items keep the
    path of the list itself. Every truthy leaf is stored as the tuple
    ``(api_doc, leaf)`` in the set ``d[path]``; falsy leaves (None, "", 0,
    False) are ignored. A set stops accepting new values once it holds 10.
    '''
    if isinstance(object, dict):
        for child_key, child in object.items():
            add_to_dict(d, "".join((prev_path, "[", child_key, "]")), child, api_doc)
        return
    if isinstance(object, list):
        for element in object:
            add_to_dict(d, prev_path, element, api_doc)
        return
    if not object:
        # Falsy leaf: nothing is recorded and no key is created in ``d``.
        return
    bucket = d[prev_path]
    if len(bucket) < 10:
        bucket.add((api_doc, object))
def build_dict(d, object, api_doc):
    '''Flatten ``object`` into ``d`` via add_to_dict, starting from an empty root path.'''
    root_path = ''
    add_to_dict(d, root_path, object, api_doc)
# Collect the documented example value of every required/optional parameter
# of every endpoint into parameter_dict, keyed by "[<parameter name>]" (nested
# example structures extend the key through add_to_dict).
for folder in folders:
    path = "../extractor/apidocs/" + folder + "/" + folder + ".txt"
    try:
        # Context manager so the handle is closed even when parsing fails.
        with open(path) as doc_file:
            json_file = json.load(doc_file)
    except (OSError, ValueError):
        # Best effort: skip folders whose doc file is missing, unreadable or
        # not valid JSON (JSONDecodeError/UnicodeDecodeError are ValueErrors).
        continue
    for endpoint in json_file['endpoints']:
        # Required and optional parameters are handled identically; a
        # missing/empty list is treated as "no parameters".
        for params in (endpoint['required_parameters'], endpoint['optional_parameters']):
            for param in params or ():
                name = param['name']
                # Normalise any falsy example ("" / [] / 0) to None.
                example = param['example'] or None
                add_to_dict(parameter_dict, "[" + name + "]", example, folder)
parameter_dict


defaultdict(set,
            {'[composition]': {('composition', 'example_composition_text'),
              ('glyconnect', '5,4,2,1,0,0,0,0,0,0,0,0,0,0')},
             '[composition][hex]': {('composition', '1'),
              ('composition', '2')},
             '[composition][hexnac]': {('composition', '1'),
              ('composition', '2')},
             '[composition][dhex]': {('composition', '0'),
              ('composition', '1'),
              ('composition', '3')},
             '[composition][neu5ac]': {('composition', '0'),
              ('composition', '1'),
              ('composition', '2')},
             '[composition][neu5gc]': {('composition', '0'),
              ('composition', '1')},
             '[composition][P]': {('composition', '0'), ('composition', '1')},
             '[composition][S]': {('composition', '0'), ('composition', '1')},
             '[composition][Ac]': {('composition', '0'), ('composition', '1')},
             '[sequence]': {('glycam',
           

In [18]:
folders = os.listdir("../extractor/apidocs")
description_to_param_dict = {} # map description to parameter
param_to_description_dict = {} # map parameter to description
for folder in folders:
    path = "../extractor/apidocs/" + folder + "/" + folder + ".txt"
    try:
        # Context manager so the handle is closed even when parsing fails.
        with open(path) as doc_file:
            json_file = json.load(doc_file)
    except (OSError, ValueError):
        # Best effort: skip folders whose doc file is missing, unreadable or
        # not valid JSON (JSONDecodeError/UnicodeDecodeError are ValueErrors).
        continue
    for endpoint in json_file['endpoints']:
        # Required and optional parameters are handled identically; a
        # missing/empty list is treated as "no parameters".
        for params in (endpoint['required_parameters'], endpoint['optional_parameters']):
            for param in params or ():
                name = param['name']
                description = param['description']
                # Plain dict assignment: when two parameters share a
                # description (or a name recurs), the last one seen wins.
                description_to_param_dict[description] = name
                param_to_description_dict[name] = description

description_to_param_dict

{'Composition text to be converted to WURCS format.': 'composition',
 'Array of composition objects to be converted to WURCS format.': 'composition',
 'The sequence of the sugar in condensed glycam notation.': 'sequence',
 'GlycoCT format text.': 'glycoct',
 'IUPAC Extended format text.': 'iupacextended',
 'IUPAC Condensed format text.': 'iupaccondensed',
 'LinearCode format text.': 'linearcode',
 'KCF format text.': 'kcf',
 'WURCS-JSON format text.': 'wurcsjson',
 'WURCS format text.': 'wurcs',
 'Image format.': 'format',
 'Data type.': 'type',
 'GlycoCT representation of glycan used to convert': 'glycoct',
 'Graphical representation type (SNFG representation uses cfg notation)': 'notation',
 'Image file format': 'format',
 'GWS representation of glycan used to convert': 'gws',
 'Iupac representation of glycan to convert to GlycoCT': 'iupac',
 'Glycan type': 'glycan_type',
 'GlycoCT representation of glycan to convert to GWS': 'glycoct',
 'GWS representation of glycan to convert to Gl

# Get API response

In [None]:
# no need to run again
# Runs every generated endpoint script (*.py) in each API folder as a
# subprocess and stores its JSON stdout next to it as <script>_response.json.
import sys

apidocs_dir = "../extractor/apidocs"
count = 0  # number of scripts executed
evaluation_result={}  # not used in this cell; kept because other cells may read this notebook global
folders = os.listdir(apidocs_dir)
for folder in folders:
    folder_path = os.path.join(apidocs_dir, folder)
    if not os.path.isdir(folder_path):
        # Stray files in the apidocs directory are not API folders.
        continue
    failed_endpoints = []  # scripts of this folder that exited non-zero or printed non-JSON
    api_result={}  # not used in this cell; kept for the same reason as evaluation_result
    files = [x for x in os.listdir(folder_path) if x.endswith(".py")]
    for file_name in files:
        count += 1
        file_path = os.path.join(folder_path, file_name)
        try:
            # sys.executable is the interpreter running this kernel, so the
            # script sees the same installed packages as the notebook.
            result = subprocess.run(
                [sys.executable, file_path], capture_output=True, text=True, check=True
            )
        except subprocess.CalledProcessError as e:
            failed_endpoints.append(file_name)
            print(f"Failed: {file_name} (exit code {e.returncode})")
        else:
            if result.stdout:
                try:
                    output_json = json.loads(result.stdout)
                except json.JSONDecodeError as e:
                    # One script printing non-JSON must not abort the whole run.
                    failed_endpoints.append(file_name)
                    print(f"Failed: {file_name} (stdout is not valid JSON: {e})")
                else:
                    # save in file
                    with open(file_path[:-3] + '_response.json', "w") as f:
                        json.dump(output_json, f)
        print(f"Tested: {file_name}")


Tested: convert_composition_to_wurcs__get__GET.py
Tested: convert_composition_to_wurcs__post__POST.py
Tested: build_structure_via_url_GET.py
Tested: convert_glycoct_to_wurcs_GET.py
Tested: convert_iupac_condensed_to_wurcs_GET.py
Tested: convert_iupac_extended_to_wurcs_GET.py
Tested: convert_kcf_to_wurcs_GET.py
Tested: convert_linearcode_to_wurcs_GET.py
Tested: convert_wurcs_json_to_wurcs_GET.py
Tested: convert_wurcs_to_glycam_sequence_GET.py
Tested: convert_wurcs_to_glycoct_GET.py
Tested: convert_wurcs_to_iupac_condensed_GET.py
Tested: convert_wurcs_to_iupac_extended_GET.py
Tested: convert_wurcs_to_wurcs_json_GET.py
Tested: normalize_glycoct_GET.py
Tested: normalize_wurcs_GET.py
Tested: wurcs_to_image_GET.py
Tested: wurcs_to_image_POST.py
Tested: convertcompositions_POST.py
Tested: convertglycocttogws_POST.py
Tested: convertgwstoglycoct_POST.py
Tested: convertiupactoglycoct_POST.py
Tested: getcartoonfromsequenceglycoct_POST.py
Tested: getcartoonfromsequencegws_POST.py
Tested: getcompos

In [3]:
# Build response_dict: flatten every saved *_response.json whose recorded
# status code is 200 into path -> {(api_folder, value)} sets.
apidocs_dir = "../extractor/apidocs"
count = 0  # not used in this cell; kept because other cells may read this notebook global
evaluation_result={}  # not used in this cell; kept for the same reason
folders = os.listdir(apidocs_dir)
response_dict = defaultdict(set)
for folder in folders:
    folder_path = os.path.join(apidocs_dir, folder)
    if not os.path.isdir(folder_path):
        # Stray files in the apidocs directory are not API folders.
        continue
    failed_endpoints = []  # not used in this cell
    api_result={}  # not used in this cell
    files = [x for x in os.listdir(folder_path) if x.endswith("_response.json")]
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        # Context manager so the handle is closed after parsing.
        with open(file_path) as response_file:
            response = json.load(response_file)
        # The saved payload is whatever the script printed; only dicts that
        # report status 200 contribute, anything else is skipped.
        if isinstance(response, dict) and response.get('status_code') == 200:
            build_dict(response_dict, response.get('json'), folder)
response_dict



defaultdict(set,
            {'[id]': {('glycanformatconverter', 'G20624LQ'),
              ('glycanformatconverter', 'G22768VO'),
              ('glycosmos-otherapis', 'G03717EM'),
              ('glygen', '669dae2d4745e2a7500f4044'),
              ('glygen', '669dae3fbd6b77d5ba1bc488'),
              ('glygen', '669dae5436a110727358bd47'),
              ('glygen', '66d1def3b29b17dfd193489c'),
              ('glygen', '66db3e0b982755a6709c9e1d'),
              ('glygen', '672ea81e9a8d2e6730dfdbf8'),
              ('glytoucandata', 'G22768VO')},
             '[wurcs]': {('glycanformatconverter',
               'WURCS=2.0/3,5,4/[a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5]/1-1-2-3-3/a4-b1_b4-c1_c3-d1_c6-e1'),
              ('glycanformatconverter',
               'WURCS=2.0/4,5,4/[a2122h-1x_1-5_2*NCC/3=O][a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5]/1-2-3-4-4/a4-b1_b4-c1_c3-d1_c6-e1'),
              ('glycosmos-otherapis',
               'WURCS=2.0/3,4,3/[a2122h-1b_1

# Calculate embedding, build parameter knowledge base

In [12]:
from charset_normalizer import md__mypyc
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used for every similarity search below.
model = SentenceTransformer("flax-sentence-embeddings/st-codesearch-distilroberta-base")

# Freeze the key order of each knowledge base as a list: the corpus_id of a
# search hit is used as an index into these lists.
param_keys, response_keys, description_keys = (
    list(mapping.keys())
    for mapping in (parameter_dict, response_dict, description_to_param_dict)
)

# One embedding tensor per key list, in the same order as the lists.
param_keys_emb, response_keys_emb, description_keys_emb = (
    model.encode(keys, convert_to_tensor=True)
    for keys in (param_keys, response_keys, description_keys)
)




In [None]:
# Demo query: a parameter name and the description recorded for it.
query = 'iupacextended'
query_description = param_to_description_dict[query]

# Embed the name and the description separately.
query_emb = model.encode(query, convert_to_tensor=True)
query_description_emb = model.encode(query_description, convert_to_tensor=True)

# The name is searched against parameter paths and response paths; the
# description is searched against the known parameter descriptions.
hits_param = util.semantic_search(query_emb, param_keys_emb)
hits_response = util.semantic_search(query_emb, response_keys_emb)
hits_description = util.semantic_search(query_description_emb, description_keys_emb)

[[{'corpus_id': 4, 'score': 1.0},
  {'corpus_id': 5, 'score': 0.9225164651870728},
  {'corpus_id': 9, 'score': 0.6423014402389526},
  {'corpus_id': 3, 'score': 0.6313684582710266},
  {'corpus_id': 7, 'score': 0.6210544109344482},
  {'corpus_id': 49, 'score': 0.6097309589385986},
  {'corpus_id': 6, 'score': 0.597919225692749},
  {'corpus_id': 8, 'score': 0.5900189876556396},
  {'corpus_id': 0, 'score': 0.5785233378410339},
  {'corpus_id': 48, 'score': 0.49412381649017334}]]

In [24]:
# Hits of the parameter-name search, with the stored example values.
h = hits_param[0]
for hit in h:
    hit_key = param_keys[hit['corpus_id']]
    print(f"Param: {hit_key} Example: {parameter_dict[hit_key]}")
    print(f"Similarity: {hit['score']}", end="\n\n")

print("==================================")
# Hits of the same query against response paths.
h = hits_response[0]
for hit in h:
    hit_key = response_keys[hit['corpus_id']]
    print(f"Request: {hit_key} Example: {response_dict[hit_key]}")
    print(f"Similarity: {hit['score']}", end="\n\n")

print("==================================")
# Hits of the description search; examples come from the parameter that the
# matched description maps to.
h = hits_description[0]
for hit in h:
    hit_key = description_keys[hit['corpus_id']]
    mapped_param = description_to_param_dict[hit_key]
    print(f"Description: {hit_key} Example: {parameter_dict['[' + mapped_param + ']']}")
    print(f"Similarity: {hit['score']}", end="\n\n")

Param: [iupacextended] Example: {('glycanformatconverter', 'α-D-Manp-(1→3)[α-D-Manp-(1→6)]-β-D-Manp-(1→4)-β-D-GlcpNAc-(1→4)-β-D-GlcpNAc-(1→')}
Similarity: 0.8894085884094238

Param: [iupaccondensed] Example: {('glycanformatconverter', 'Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc(b1-')}
Similarity: 0.7328687310218811

Param: [iupac] Example: {('glyconnect', 'GlcNAc(b1-6)[Fuc(a1-2)]Gal(b1-3)GalNAc')}
Similarity: 0.6792234778404236

Param: [uniprot] Example: {('glyconnect', 'P01871')}
Similarity: 0.46696290373802185

Param: [isvalue] Example: {('unilectin', 'Plant'), ('unilectin', 'Curated'), ('unilectin', 'Bacteria%'), ('unilectin', 'Rattus norvegicus'), ('unilectin', '%Gal%'), ('unilectin', 'Sclerotinia sclerotiorum')}
Similarity: 0.45244187116622925

Param: [ensembl] Example: {('proteins', 'ENSG00000139618')}
Similarity: 0.4444567561149597

Param: [gend] Example: {('proteins', '58219305')}
Similarity: 0.4239526093006134

Param: [epitope_sequence] Example: {('proteins', 'MKTAYIAKQRQ

# load generated tools

In [7]:
import sys
def load_and_import(api_name, function_name):
    '''Import the module ``function_name`` from ``../extractor/apidocs/<api_name>``.

    The API folder is put at the front of ``sys.path`` only for the duration
    of the import and is removed again even when the import raises.

    Args:
        api_name: folder name of the API under ``../extractor/apidocs``.
        function_name: module name (file name without ``.py``) to import.

    Returns:
        The imported module object.

    Raises:
        ImportError: if no such module can be found.

    Note:
        ``__import__`` returns an already-imported module of the same name
        from ``sys.modules``, so two API folders containing identically named
        files resolve to whichever one was imported first.
    '''
    folder_path = os.path.join("..","extractor", "apidocs", api_name)
    sys.path.insert(0, folder_path)
    try:
        return __import__(function_name)
    finally:
        # Remove exactly the entry added above (not blindly index 0, which the
        # imported module may have changed), and do so on failure as well.
        try:
            sys.path.remove(folder_path)
        except ValueError:
            pass

# Example: load the generated module convert_glycoct_to_wurcs_GET from the
# glycanformatconverter API folder.
module = load_and_import("glycanformatconverter", "convert_glycoct_to_wurcs_GET")

In [20]:
# Look up the function on the loaded module and call it once with a GlycoCT
# string passed as the `glycoct` keyword argument.
f = getattr(module, "convert_glycoct_to_wurcs")
response = f(glycoct='''RES\n1b:x-dglc-HEX-1:5\n2s:n-acetyl\n3b:b-dglc-HEX-1:5\n4s:n-acetyl\n5b:b-dman-HEX-1:5\n6b:a-dman-HEX-1:5\n7b:a-dman-HEX-1:5\nLIN\n1:1d(2+1)2n\n2:1o(4+1)3d\n3:3d(2+1)4n\n4:3o(4+1)5d\n5:5o(3+1)6d\n6:5o(6+1)7d''')




# query KB and get parameter value candidates

In [34]:
def get_function(api_name, endpoint_name):
    '''Return the callable for an endpoint from its generated module.

    The module is named ``endpoint_name``; the function inside it is expected
    to carry the same name with every "_GET" and "_POST" occurrence removed.
    '''
    function_name = endpoint_name
    for method_tag in ("_GET", "_POST"):
        function_name = function_name.replace(method_tag, "")
    endpoint_module = load_and_import(api_name, endpoint_name)
    return getattr(endpoint_module, function_name)

import random
def _last_path_segment(path_key):
    '''Return the innermost bracketed name of a "[a][b][c]" path key ("c").'''
    return path_key[path_key.rfind('[') + 1:-1]


def _sample_hit_examples(hit_list, corpus_keys, examples_of, label_of, api_doc_name,
                         max_examples, max_examples_per_hit, similarity_threshold):
    '''Draw candidate (label, value) pairs from one list of search hits.

    Hits are walked in the given order (util.semantic_search is documented to
    return them by decreasing score) until a hit scores below
    ``similarity_threshold`` or ``max_examples`` values have been drawn.

    Args:
        hit_list: hits for a single query, dicts with 'corpus_id' and 'score'.
        corpus_keys: list mapping corpus_id to the knowledge-base key.
        examples_of: callable key -> iterable of (api_doc, value) tuples.
        label_of: callable key -> name reported alongside each value.
        api_doc_name: values recorded for this API are excluded.
        max_examples: upper bound on the number of values drawn in total.
        max_examples_per_hit: upper bound on values drawn from one hit.
        similarity_threshold: minimum score for a hit to be used.

    Returns:
        A set of (label, value) tuples.
    '''
    pairs = set()
    collected = 0
    for hit in hit_list:
        if hit['score'] < similarity_threshold or collected >= max_examples:
            break
        key = corpus_keys[hit['corpus_id']]
        candidates = [value for source, value in examples_of(key) if source != api_doc_name]
        quota = min(len(candidates), max_examples_per_hit, max_examples - collected)
        chosen = random.sample(candidates, quota)
        label = label_of(key)
        pairs.update((label, value) for value in chosen)
        collected += len(chosen)
    return pairs


def get_test_examples(query, api_doc_name, max_param_examples=5, max_response_examples=5, max_examples_per_hit=1, similarity_threshold=0.5, query_description=None):
    '''Collect candidate values for the parameter ``query`` from other APIs.

    Three searches feed the result: the parameter name against known
    parameter paths, the parameter name against response paths, and, when a
    description is available, the description against known parameter
    descriptions. Values recorded for ``api_doc_name`` itself are excluded,
    and the values taken from each hit are chosen at random.

    Args:
        query: parameter name to find candidate values for.
        api_doc_name: API folder the parameter belongs to.
        max_param_examples: cap for the parameter search; also used as the
            cap for the description search.
        max_response_examples: cap for the response search.
        max_examples_per_hit: values drawn from a single matching key.
        similarity_threshold: minimum similarity score for a hit.
        query_description: description to search with; when falsy, the one
            stored in param_to_description_dict for ``query`` is used, if any.

    Returns:
        A set of (parameter_name, example_value) tuples.
    '''
    if not query_description:
        query_description = param_to_description_dict.get(query)
    query_emb = model.encode(query, convert_to_tensor=True)
    hits_param = util.semantic_search(query_emb, param_keys_emb)
    hits_response = util.semantic_search(query_emb, response_keys_emb)

    all_examples = set()
    # .get(..., ()) instead of indexing: indexing a defaultdict would insert
    # empty entries into the shared knowledge bases for unknown keys.
    all_examples |= _sample_hit_examples(
        hits_param[0], param_keys,
        lambda key: parameter_dict.get(key, ()), _last_path_segment,
        api_doc_name, max_param_examples, max_examples_per_hit, similarity_threshold)
    all_examples |= _sample_hit_examples(
        hits_response[0], response_keys,
        lambda key: response_dict.get(key, ()), _last_path_segment,
        api_doc_name, max_response_examples, max_examples_per_hit, similarity_threshold)
    if query_description:
        query_description_emb = model.encode(query_description, convert_to_tensor=True)
        hits_description = util.semantic_search(query_description_emb, description_keys_emb)
        all_examples |= _sample_hit_examples(
            hits_description[0], description_keys,
            lambda key: parameter_dict.get('[' + description_to_param_dict[key] + ']', ()),
            lambda key: description_to_param_dict[key],
            api_doc_name, max_param_examples, max_examples_per_hit, similarity_threshold)
    return all_examples

    

In [35]:
# Example: candidates for the "iupacextended" parameter, leaving out values
# that were recorded for the glycanformatconverter API itself.
get_test_examples("iupacextended", "glycanformatconverter", max_param_examples=5, max_response_examples=5, max_examples_per_hit=1, similarity_threshold=0.5)

{('8-9', 'l-1:7,11'),
 ('composition', 'example_composition_text'),
 ('glycoct',
  'RES\n1b:a-dgal-HEX-1:5\n2s:n-acetyl\n3b:b-dglc-HEX-1:5\n4s:n-acetyl\n5b:b-dgal-HEX-1:5\n6b:a-lgal-HEX-1:5|6:d\nLIN\n1:1d(2+1)2n\n2:1o(3+1)3d\n3:3d(2+1)4n\n4:3o(3+1)5d\n5:3o(4+1)6d'),
 ('gws',
  'freeEnd--?a1D-GalNAc,p(--3b1D-Gal,p--??2D-NeuAc,p)--6b1D-GlcNAc,p--??1D-Gal,p--??2D-NeuAc,p}--??1Ac$MONO,Und,0,0,freeEnd'),
 ('inchi',
  'InChI=1S/C90H148N6O66/c1-21-47(116)59(128)62(131)81(142-21)140-20-40-69(55(124)43(77(135)143-40)93-24(4)108)152-78-44(94-25(5)109)56(125)66(36(16-103)148-78)153-82-63(132)72(156-86-76(61(130)51(120)33(13-100)147-86)158-80-46(96-27(7)111)58(127)68(38(18-105)150-80)155-84-65(134)74(53(122)35(15-102)145-84)162-90(88(138)139)9-29(113)42(92-23(3)107)71(160-90)49(118)31(115)11-98)54(123)39(151-82)19-141-85-75(60(129)50(119)32(12-99)146-85)157-79-45(95-26(6)110)57(126)67(37(17-104)149-79)154-83-64(133)73(52(121)34(14-101)144-83)161-89(87(136)137)8-28(112)41(91-22(2)106)70(159-89)48(1

In [36]:
import inspect
import time
import itertools
def get_parameter_names(func):
    '''Return the names of all parameters of ``func`` in declaration order.'''
    declared = inspect.signature(func).parameters
    names = []
    for parameter in declared.values():
        names.append(parameter.name)
    return names
def _summarize_response(response):
    '''Return the status code, parsed JSON body ("" when the body is not JSON) and raw text of a response.'''
    try:
        payload = response.json()
    except Exception:
        payload = ""
    return {'status_code': response.status_code, 'json': payload, 'text': response.text}


def run_experiment(MAX_SAMPLES=100):
    '''Call every generated endpoint with candidate values from the knowledge base.

    For each API folder under ``../extractor/apidocs`` that has a readable
    ``<folder>.txt`` JSON document with an 'endpoints' entry, every ``*.py``
    endpoint module (except ``__init__.py``) is loaded and called:

    * functions with exactly one parameter are called once per candidate
      returned by get_test_examples for that parameter;
    * all other functions are called with up to ``MAX_SAMPLES`` randomly
      chosen combinations of per-parameter candidates.

    The outcome of every call that returned a response is stored under
    'tests' in ``<endpoint>_example_test.json`` next to the module. An
    existing record file is loaded first, so its other fields are kept, but
    its 'tests' list is replaced. Calls that raise are reported and skipped.

    Args:
        MAX_SAMPLES: maximum number of argument combinations tried per
            multi-parameter endpoint.
    '''
    apidocs_dir = "../extractor/apidocs"
    example_search_options = dict(max_param_examples=5, max_response_examples=5,
                                  max_examples_per_hit=1, similarity_threshold=0.5)
    for folder in os.listdir(apidocs_dir):
        folder_path = os.path.join(apidocs_dir, folder)
        if not os.path.isdir(folder_path):
            # Stray files in the apidocs directory are not API folders.
            continue
        try:
            with open(os.path.join(folder_path, folder + ".txt")) as doc_file:
                api_extracted_json = json.load(doc_file)
            # Presence check only; the endpoint list itself is not used yet.
            api_extracted_json['endpoints']
        except (OSError, ValueError, KeyError, TypeError):
            print("No extracted information for api: ", folder)
            continue
        # TODO: fill endpoint_result['extracted_parameters'] with the documented
        # (ground-truth) parameters of the matching extracted endpoint.
        for file_name in os.listdir(folder_path):
            # Checked up front so package markers are skipped whether or not a
            # record file exists for them.
            if not file_name.endswith(".py") or file_name == "__init__.py":
                continue
            endpoint_name = file_name[:-3]
            record_path = os.path.join(folder_path, endpoint_name + "_example_test.json")
            if os.path.exists(record_path):
                with open(record_path) as record_file:
                    endpoint_result = json.load(record_file)
            else:
                endpoint_result = {"endpoint": endpoint_name, "tests": [], "extracted_parameters": {}, "validated_parameters": {}}
                print(f"Testing {folder}.{endpoint_name}")

            try:
                endpoint_func = get_function(folder, endpoint_name)
                param_names = get_parameter_names(endpoint_func)
            except Exception as exc:
                # A single broken generated module must not end the whole run.
                print(f"Could not load {folder}.{endpoint_name} ({exc!r})")
                continue

            tests = []
            # simulate the situation that no example is provided
            if len(param_names) == 1:
                # for a single parameter function, we can try a few test cases.
                curr_param = param_names[0]
                examples = get_test_examples(curr_param, folder, **example_search_options)
                for test_param, example in examples:
                    try:
                        response = endpoint_func(example)
                    except Exception as exc:
                        # "except Exception" keeps Ctrl-C able to stop the run.
                        print(f'Error occured when calling {folder}.{endpoint_name} with {curr_param}={example} ({exc!r})')
                        continue
                    tests.append({'gt_param': curr_param, 'test_param': test_param, 'candidate': example,
                                  **_summarize_response(response)})
                    time.sleep(0.1)  # short pause between consecutive API calls
            else:
                # use a rough estimation, record the ranking of the ground truth parameter(?)
                example_list = [get_test_examples(param, folder, **example_search_options)
                                for param in param_names]
                # With no parameters the product is one empty combination, so
                # the function is called once without arguments; a parameter
                # without candidates makes the product empty.
                candidate_combinations = list(itertools.product(*example_list))
                samples = random.sample(candidate_combinations, min(MAX_SAMPLES, len(candidate_combinations)))
                print(f"Testing {folder}.{endpoint_name} with {len(samples)} samples")
                for sample in samples:
                    # Each element of sample is a (source_param_name, value) pair.
                    input_dict = {name: pair[1] for name, pair in zip(param_names, sample)}
                    try:
                        response = endpoint_func(**input_dict)
                    except Exception as exc:
                        print(f'Error occured when calling {folder}.{endpoint_name} with {input_dict} ({exc!r})')
                        continue
                    tests.append({'gt_param': param_names, 'test_param': [t[0] for t in sample],
                                  'candidate': [t[1] for t in sample],
                                  **_summarize_response(response)})
                    time.sleep(0.1)  # short pause between consecutive API calls
            endpoint_result['tests'] = tests
            # save endpoint_result
            with open(record_path, "w") as record_file:
                json.dump(endpoint_result, record_file)



In [37]:
# Run the experiment with the default MAX_SAMPLES=100; results are written to
# <endpoint>_example_test.json files inside each API folder.
run_experiment()



Error occured when calling composition.convert_composition_to_wurcs__get__GET with composition=2




Error occured when calling composition.convert_composition_to_wurcs__get__GET with composition=16




Error occured when calling glycanformatconverter.convert_glycoct_to_wurcs_GET with glycoct=21932




Error occured when calling glycanformatconverter.convert_iupac_condensed_to_wurcs_GET with iupaccondensed=True




Error occured when calling glycanformatconverter.normalize_glycoct_GET with glycoct=21904




Testing glycanimage.wurcs_to_image_GET with 100 samples




Testing glycanimage.wurcs_to_image_POST with 100 samples




Testing glyconnect.convertiupactoglycoct_POST with 96 samples




Testing glyconnect.getcartoonfromsequenceglycoct_POST with 100 samples




Testing glyconnect.getcartoonfromsequencegws_POST with 100 samples




Testing glyconnect.getreleasedate_GET with 1 samples




Testing glyconnect.getreleaseinformation_GET with 1 samples




Testing glyconnect.listglycosylationsasjson_GET with 0 samples




Testing glycosmos-otherapis.partial_match_search_by_glycoql_GET with 100 samples




Error occured when calling glycosmos-otherapis.partial_match_search_by_wurcs_rdf_GET with wurcs=WURCS=2.0/8,12,11/[a2122h-1x_1-5_2*NCC/3=O][a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5][a1221m-1a_1-5][a2112h-1b_1-5][Aad21122h-2a_2-6_5*NCC/3=O][a2112h-1b_1-5_2*NCC/3=O]/1-2-3-4-2-5-6-7-4-2-8-5/a4-b1_a6-l1_b4-c1_c3-d1_c6-i1_d2-e1_e3-f1_e4-g1_i2-j1_j4-k1_g?-h2




Testing glygen.biomarker_detail_POST with 0 samples
Testing glygen.glycan_detail_POST with 0 samples




Testing glytoucandata.glytoucan_data_list_GET with 1 samples




Testing kegg.conv_GET with 70 samples




Error occured when calling kegg.conv_GET with {'target_db': 'ProteinGI,ProteinName,GeneID,GeneSymbol', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'Ensembl', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'CFG', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'PubChem Substance', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'canonicalsequences_all', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'ENSP00000351276', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'GlyTouCan', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'GlyCosmos', 'source_db': 6}




Error occured when calling kegg.conv_GET with {'target_db': 'PubChem Compound', 'source_db': 6}
Error occured when calling kegg.conv_GET with {'target_db': 'Glycosciences.de', 'source_db': 6}




Testing kegg.find_GET with 20 samples




Testing kegg.get_GET with 0 samples




Testing kegg.link_GET with 63 samples




Error occured when calling kegg.link_GET with {'target_db': 'ChEBI', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'ProteinGI,ProteinName,GeneID,GeneSymbol', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'CarbBank', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'ENSP00000351276', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'GlyTouCan', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'Ensembl', 'source_db': 201}
Error occured when calling kegg.link_GET with {'target_db': 'CFG', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'canonicalsequences_all', 'source_db': 201}




Error occured when calling kegg.link_GET with {'target_db': 'GlycoEpitope', 'source_db': 201}




Testing proteins.get_genome_coordinate_by_protein_sequence_position_GET with 100 samples




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Blood Serum', 'gPosition': '1'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Ceruloplasmin', 'gPosition': '1'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'CHO', 'gPosition': '1'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'RES\n1b:b-dglc-HEX-1:5\n2s:n-acetyl\n3b:b-dglc-HEX-1:5\n4s:n-acetyl\n5b:b-dman-HEX-1:5\n6b:a-dman-HEX-1:5\n7b:a-dman-HEX-1:5\nLIN\n1:1d(2+1)2n\n2:1o(4+1)3d\n3:3d(2+1)4n\n4:3o(4+1)5d\n5:5o(3+1)6d\n6:5o(6+1)7d\n', 'gPosition': 'freeEnd--?b1D-GlcNAc,p--4b1D-GlcNAc,p--4b1D-Man,p--?a1D-Man,p$MONO,Und,0,0,freeEnd'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Ceruloplasmin', 'gPosition': '1'}
Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'CHO', 'gPosition': '0'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Blood Serum', 'gPosition': '5,4,2,1,0,0,0,0,0,0,0,0,0,0'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Lymphocyte', 'gPosition': '0'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Blood Serum', 'gPosition': 'freeEnd--?b1D-GlcNAc,p--4b1D-GlcNAc,p--4b1D-Man,p--?a1D-Man,p$MONO,Und,0,0,freeEnd'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Blood Serum', 'gPosition': '1'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'RES\n1b:b-dglc-HEX-1:5\n2s:n-acetyl\n3b:b-dglc-HEX-1:5\n4s:n-acetyl\n5b:b-dman-HEX-1:5\n6b:a-dman-HEX-1:5\n7b:a-dman-HEX-1:5\nLIN\n1:1d(2+1)2n\n2:1o(4+1)3d\n3:3d(2+1)4n\n4:3o(4+1)5d\n5:5o(3+1)6d\n6:5o(6+1)7d\n', 'gPosition': '1'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'RES\n1b:b-dglc-HEX-1:5\n2s:n-acetyl\n3b:b-dglc-HEX-1:5\n4s:n-acetyl\n5b:b-dman-HEX-1:5\n6b:a-dman-HEX-1:5\n7b:a-dman-HEX-1:5\nLIN\n1:1d(2+1)2n\n2:1o(4+1)3d\n3:3d(2+1)4n\n4:3o(4+1)5d\n5:5o(3+1)6d\n6:5o(6+1)7d\n', 'gPosition': '5,4,2,1,0,0,0,0,0,0,0,0,0,0'}




Error occured when calling proteins.get_genome_coordinate_by_protein_sequence_position_GET with {'taxonomy': 10116, 'chromosome': 'Lymphocyte', 'gPosition': '5,4,2,1,0,0,0,0,0,0,0,0,0,0'}




Testing proteins.get_genome_coordinate_by_protein_sequence_position_range_GET with 0 samples




Testing proteins.search_antigens_in_uniprot_GET with 0 samples
Testing proteins.search_epitope_in_uniprot_GET with 0 samples
Testing proteins.search_genomic_coordinates_for_uniprot_entries_GET with 0 samples
Testing proteins.search_protein_sequence_features_in_uniprot_GET with 0 samples
Testing proteins.search_uniprot_entries_by_genomic_database_cross_reference_ids_GET with 0 samples
Testing proteins.search_uniprot_entries_by_taxonomy_and_genomic_coordinates_GET with 0 samples
Testing proteins.search_uniprot_entries_by_taxonomy_and_genomic_coordinates_with_feature_GET with 0 samples
Testing pubchem.assay_description_GET with 0 samples
Testing pubchem.assay_dose_response_GET with 0 samples
Testing pubchem.assay_summary_GET with 0 samples
Testing pubchem.assay_targets_GET with 0 samples
Testing pubchem.cell_line_summary_GET with 0 samples
Testing pubchem.classification_GET with 0 samples
Testing pubchem.compound_property_tables_GET with 0 samples
Testing pubchem.conformers_GET with 0 sam



Testing unilectin.get_biotech_lectins_POST with 0 samples
Testing unilectin.get_fungal_lectins_POST with 0 samples
Testing unilectin.get_human_lectome_POST with 0 samples
Testing unilectin.get_lectins_POST with 0 samples
Testing unilectin.get_ligands_POST with 0 samples
Testing unilectin.get_predicted_lectins_POST with 0 samples
Testing unilectin.get_propeller_lectins_POST with 0 samples
Testing unilectin.get_trefoil_lectins_POST with 0 samples




Testing WURCS-RDF.wurcs_to_wurcs_rdf_GET with 100 samples




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'WURCS=2.0/2,5,4/[axxxxh-1x_1-5_2*NCC/3=O][axxxxh-1x_1-5]/1-1-2-2-2/a?|b?|c?|d?|e?'}




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'freeEnd--?a1D-GalNAc,p(--3b1D-Gal,p--??2D-NeuAc,p)--6b1D-GlcNAc,p--??1D-Gal,p--??2D-NeuAc,p}--??1Ac$MONO,Und,0,0,freeEnd'}




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': '{"Composition":{},"WURCS":"","Aglycone":"","Fragments":{},"Repeat":{},"Edges":{"e0":{"Acceptor":{"Position":[4],"Node":"m0","LinkageType":"H_AT_OH"},"Donor":{"Position":[1],"Node":"m1","LinkageType":"DEOXY"},"Probability":{"High":1,"Low":1}},"e1":{"Acceptor":{"Position":[4],"Node":"m1","LinkageType":"H_AT_OH"},"Donor":{"Position":[1],"Node":"m2","LinkageType":"DEOXY"},"Probability":{"High":1,"Low":1}},"e2":{"Acceptor":{"Position":[3],"Node":"m2","LinkageType":"H_AT_OH"},"Donor":{"Position":[1],"Node":"m3","LinkageType":"DEOXY"},"Probability":{"High":1,"Low":1}},"e3":{"Acceptor":{"Position":[6],"Node":"m2","LinkageType":"H_AT_OH"},"Donor":{"Position":[1],"Node":"m4","LinkageType":"DEOXY"},"Probability":{"High":1,"Low":1}}},"AN":"","Bridge":{},"Monosaccharides":{"m0":{"Modifications":[],"TrivialName":["glc"],"Substituents":[{"Status":"simple","Acceptor":{"Position":[2],"LinkageType":"DEOXY"},"Donor



Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'WURCS=2.0/3,5,4/[a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5]/1-1-2-3-3/a4-b1_b4-c1_c3-d1_c6-e1'}




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'WURCS=2.0/3,4,3/[a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5]/1-1-2-3/a4-b1_b4-c1_c?-d1'}
Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'freeEnd--1b1D-GlcNAc,p(--4b1D-GlcNAc,p--4b1D-Man,p(--3a1D-Man,p--2b1D-GlcNAc,p--4b1D-Gal,p--3a2D-NeuAc,p)--6a1D-Man,p--2b1D-GlcNAc,p--4b1D-Gal,p--3a2D-NeuAc,p)--6a1L-Fuc,p$MONO,Und,0,0,freeEnd'}




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'freeEnd--?b1D-GlcNAc,p--4b1D-GlcNAc,p--4b1D-Man,p--?a1D-Man,p$MONO,Und,0,0,freeEnd'}




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'example_composition_text'}




Error occured when calling WURCS-RDF.wurcs_to_wurcs_rdf_GET with {'gtcid': 2244, 'wurcs': 'WURCS=2.0/6,12,11/[a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5][a1221m-1a_1-5][a2112h-1b_1-5][Aad21122h-2a_2-6_5*NCC/3=O]/1-1-2-3-1-4-5-3-1-5-6-4/a4-b1_a6-l1_b4-c1_c3-d1_c6-h1_d2-e1_e3-f1_e4-g1_h2-i1_i4-j1_j?-k2'}




# let gpt evaluate the tested examples
1. check if status code is 200
2. chunk the response
3. send the response to gpt and let it check whether the API returned an actual response rather than an error message

In [38]:
from langchain.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# Prompt template used to classify an API response as information or an error message.
# The {description} and {response} placeholders are filled in by gpt_evaluate.
tagging_prompt = ChatPromptTemplate.from_template(
    """
Decide if the following API response is an information or an error message.

API Description:
{description}
API Response:
{response}
"""
)
# Structured-output schema for the classifier: one required string field whose
# `enum` metadata lists the two labels 'information' and 'error'.
# NOTE(review): documented with `#` comments rather than a class docstring, since a
# docstring may end up in the schema sent to the model -- confirm before adding one.
class Classification(BaseModel):
    response_type: str= Field(..., enum=['information', 'error'])
# Chat model created with temperature=0 and wrapped with the Classification schema;
# gpt_evaluate reads `.response_type` from the object returned by llm.invoke(...).
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini").with_structured_output(Classification)

def gpt_evaluate(API_description: str, API_response: str):
    """Ask the LLM whether an API response is information or an error message.

    Args:
        API_description: Text substituted for ``{description}`` in ``tagging_prompt``.
        API_response: Text substituted for ``{response}`` in ``tagging_prompt``.

    Returns:
        The ``response_type`` attribute of the structured result returned by ``llm``.
    """
    template_values = {"description": API_description, "response": API_response}
    classification = llm.invoke(tagging_prompt.invoke(template_values))
    return classification.response_type

In [None]:
import os
import json
# Validate every saved "*_example_test.json" result file with GPT.
# For each endpoint file, walk its recorded tests in order; the first test with
# HTTP status 200 whose (truncated) response GPT labels as information is recorded
# under 'validated_parameters', and the file is rewritten in place.
apidocs_dir = "../extractor/apidocs"
# Result files larger than this (100 MB) are skipped.
MAX_RESULT_FILE_BYTES = 100 * 1024 * 1024
# Only this many leading characters of a response are sent to GPT,
# to avoid the model's max token limit.
MAX_RESPONSE_CHARS = 500
folders = os.listdir(apidocs_dir)
count_single_param_files = 0
count_multi_param_files = 0
count_validated_files = 0
count_validated_single_param_files = 0
count_validated_multi_param_files = 0
count_find_by_scemantic = 0
count_find_by_scemantic_single = 0
count_find_by_scemantic_multi = 0
scemantic_case = []
for folder in folders:
    folder_path = os.path.join(apidocs_dir, folder)
    # Stray files next to the API folders would make os.listdir raise; skip them.
    if not os.path.isdir(folder_path):
        continue
    files = [x for x in os.listdir(folder_path) if x.endswith("_example_test.json")]
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        if os.path.getsize(file_path) > MAX_RESULT_FILE_BYTES:
            continue
        # Context manager so the file handle is closed before the file is rewritten below.
        with open(file_path) as f:
            endpoint_result = json.load(f)
        endpoint_result['validated_parameters'] = {}
        print(f"Validating {folder}.{endpoint_result['endpoint']}")
        # API responses usually have similar error messages, save them to avoid repeated evaluation
        saved_error_responses = set()
        if endpoint_result['tests']:
            # A string gt_param marks a single-parameter test; anything else is
            # treated as a sequence of parameters.
            if isinstance(endpoint_result['tests'][0]['gt_param'], str):
                count_single_param_files += 1
            else:
                count_multi_param_files += 1
        for test in endpoint_result['tests']:
            status_code = test['status_code']
            if status_code != 200:
                continue
            response_text = test['text'][:MAX_RESPONSE_CHARS]
            if response_text in saved_error_responses:
                continue
            gpt_response = gpt_evaluate(endpoint_result['endpoint'], response_text)
            if gpt_response == 'error':
                print(f"Error response detected: {response_text}")
                saved_error_responses.add(response_text)
                continue
            print(f"Information response detected: {response_text}")
            count_validated_files += 1
            if isinstance(test['gt_param'], str):
                endpoint_result['validated_parameters'][test['gt_param']] = test['candidate']
                count_validated_single_param_files += 1
                # A test parameter missing from 'extracted_parameters' is counted
                # as found by semantic search.
                if test['test_param'] not in endpoint_result['extracted_parameters']:
                    count_find_by_scemantic += 1
                    count_find_by_scemantic_single += 1
                    scemantic_case.append((folder, endpoint_result['endpoint'], test['test_param'], test['gt_param']))
            else:
                count_validated_multi_param_files += 1
                # gt_param, candidate and test_param are indexed in parallel.
                for i, gt_param in enumerate(test['gt_param']):
                    endpoint_result['validated_parameters'][gt_param] = test['candidate'][i]
                    if test['test_param'][i] not in endpoint_result['extracted_parameters']:
                        count_find_by_scemantic += 1
                        count_find_by_scemantic_multi += 1
                        scemantic_case.append((folder, endpoint_result['endpoint'], test['test_param'][i], gt_param))
            # The first validated test is enough for this endpoint.
            break
        with open(file_path, "w") as f:
            json.dump(endpoint_result, f)
print(f"Validated {count_validated_files} parameters in {count_multi_param_files+count_single_param_files} files. {count_find_by_scemantic} parameters are found by semantic search.")
print(f"Validated {count_validated_single_param_files} single parameters in {count_single_param_files} files. {count_find_by_scemantic_single} single parameters are found by semantic search.")
print(f"Validated {count_validated_multi_param_files} multi parameters in {count_multi_param_files} files. {count_find_by_scemantic_multi} multi parameters are found by semantic search.")
print(scemantic_case)

            

Validating composition.convert_composition_to_wurcs__get__GET
Validating composition.convert_composition_to_wurcs__post__POST
Validating glycam.build_structure_via_url_GET
Information response detected: <!DOCTYPE html>




<html lang="en">
  <head>
      <meta charset="UTF-8">
      <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=20">
      <meta name="description" content="Utilities for molecular modeling of carbohydrates.">
      <meta 
        name="keywords" 
        content="GLYCAM, GLYCAM-Web, Carbohydrates, Oligosaccharide, UGA, CCRC, Complex Carbohydrates, Molecular Modeling"
      >
      <title>
        GLYCAM-Web | Utilities for molecular modeling of
Validating glycanformatconverter.convert_glycoct_to_wurcs_GET
Error response detected: {"message":"org.eurocarbdb.MolecularFramework.io.SugarImporterException"}
Information response detected: {"id":"G03717EM","wurcs":"WURCS=2.0/3,4,3/[a2122h-1b_1-5_2*NCC/3=O][a1122h-1b_1-5][a1122h-1a_1-5]/1-1-2-

: 