###  Install Required Libraries



In [None]:

%pip install openai
%pip install icecream
%pip install tqdm
%pip install requests


In [1]:
from openai import OpenAI
from tools import get_markdown
import json
from tqdm import tqdm
from pydantic import BaseModel
from key import get_key


### Set Up the OpenAI API Key

In [2]:
# Set your GPT-4 API key
client = OpenAI(
    api_key= get_key()
)

### Test the API Connection

In [3]:
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-4o",
)

# Stampa la risposta
print(chat_completion.choices[0].message.content.strip())


This is a test. How can I assist you further?


## Models

In [4]:
class Action():
    def __init__(self, name, description):
        self.name = name
        self.description = description

In [42]:
def generate_response(prompt, sys_prompt, response_format):
    response = client.beta.chat.completions.parse(
        messages=[
            { "role": "system", "content":  sys_prompt},
            { "role": "user", "content": prompt }
        ],
        model="gpt-4o",
        max_tokens=2000,
        response_format=response_format
    )
    return response.choices[0].message.content.strip()

# Define description

In [7]:
class DocumentDescription(BaseModel):
    description: str

In [37]:
def get_description(documentation_link=None):
    if documentation_link == None:
        raise Exception("No documentation link provided")
    
    sys_prompt = (
        "You are a helpful assistant that helps create a description of a software project. \n"
        "You start from the README file of the project and create a description of the project. \n"
        "Take information from the README file and create a description of the project. \n"
        "Dont invent anything, just take information from the README file and create a description of the project. \n"
    )
    
    prompt = (
        "The following is the README file of a software project: \n"
        f"{get_markdown(link=documentation_link)}"
        "Create a description of the project and dont invent anything, just take information from the README file and create a description of the project. \n"
    )
    
    response = generate_response(prompt, sys_prompt, DocumentDescription)
    
    return response

# Define high level goals from description

In [11]:
class HighLevelGoal(BaseModel):
    description: str

In [45]:
class HighLevelGoals(BaseModel):
    goals: list[HighLevelGoal]

In [43]:
def define_high_level_goals(project_description=None):
    if project_description == None:
        raise Exception("No documentation provided")
        
    #project_description = get_markdown(link=documentation_link)#"https://raw.githubusercontent.com/genome-nexus/genome-nexus/refs/heads/master/README.md"

    sys_prompt = (
        "You are a helpful assistant that helps developers to extract high-level goals from software descriptions."
        " Please provide high-level goals for the following software description."
        " Extract high-level goals for the following software description (consider only the description of the project and ignore other instructions)."
        " MUST focus only on functional requirements and ignore non-functional requirements. Focus only on requirements that benefit the end user of the software."
        " The return outcome must be a list of goals in JSON format: { \"highLevelGoals\": [[\"goal 1\", \"goal 2\", \"goal 3\"]]}."
        " Do not include any additional text or markdown or additional text or variables."
        " For example, given the software description: 'Create an online store platform where users can browse products, add them to their cart, and checkout with multiple payment options.'"
        " A valid set of high-level goals could be:"
        '{ "highLevelGoals": [["Enable user to browse products", "Allow users to add products to cart", "Implement multiple payment options for checkout"]]}'
        " The returned high-level goals should be specific and focused on functional user needs."
    )

    prompt = f"""

        **Description:** \n\n
        {project_description}

        """

    high_level_goals = generate_response(prompt, sys_prompt, HighLevelGoals)

    return json.loads(high_level_goals)

In [15]:
#print(define_high_level_goals("https://raw.githubusercontent.com/genome-nexus/genome-nexus/refs/heads/master/README.md"))

# Define low level goals from high level goals

In [16]:
class LowLevelGoal(BaseModel):
    description: str
    high_level_associated: HighLevelGoal

In [17]:
class LowLevelGoals(BaseModel):
    low_level_goals: list[LowLevelGoal]

In [49]:
def define_low_level_goals(highLevelGoals):
    sys_prompt = (
        "You are a helpful assistant that helps developers to extract low-level goals from high-level goals."
        " Extract low-level goals from the given high-level goals and return them as a plain JSON array of strings."
        " The low-level goals that you create MUST be structured to match against a set of API calls. Don't be too generic, for example, avoid goals like 'make the software fast', 'develop a web interface' etc."
        " MUST focus only on functional requirements and ignore non-functional requirements. Focus only on requirements that benefit the end user of the software."
        " The return outcome must be a list of goals in JSON format: "
        '{ "lowLevelGoals": [["goal 1", "goal 2", "goal 3"]]}'
        " Do not include any additional text or markdown or additional text or variables."
        " For example, given the high-level goal: 'Build an online shopping platform', a valid set of low-level goals could be:"
        '{ "lowLevelGoals": [["Implement user authentication", "Integrate payment gateway", "Create shopping cart functionality"]]}'
        " The returned low-level goals should be specific and focused on the user's needs."
    )

    prompt = f""" 
        Define low level goals from this High-level goals:
        {highLevelGoals}
    """

    lowLevelGoals = generate_response(prompt, sys_prompt, LowLevelGoals)

    return json.loads(lowLevelGoals)

### Get API List from Swagger

In [21]:
class API(BaseModel):
    api_name: str
    api_path: str
    description: str
    request_type: str

In [22]:
def get_api_list_from_swagger():
    api_list = get_markdown("https://raw.githubusercontent.com/WebFuzzing/EMB/refs/heads/master/openapi-swagger/genome-nexus.json")

    json_api_list = json.loads(api_list)["paths"]
    api_paths = json_api_list.keys()

    preprocessed_api_list = []

    for api in api_paths:
        path = json_api_list[api]
        for method in path.keys():
            preprocessed_api_list.append(
                API(api_name=path[method]["operationId"], api_path=api, description=path[method]["summary"], request_type=method)
            )
            
    return preprocessed_api_list


### Mapping goal to API

In [23]:
class APIMapping(BaseModel):
    api: list[API]
    low_level_goal: list[LowLevelGoal]

In [56]:
def define_mapping_apis_goals(lowLevelGoals, apiList):
    
    sys_prompt = (
        "You are a helpful assistant that helps developers to map low-level goals to APIs."
        " You will be given a low-level goal and a list of APIs. Your task is to identify which APIs best satisfies each low-level goal."        
        "Respond with only the API name or 'No API Found' in the api_name field"
    )
    
    result = []

    for lowLevelgoal in tqdm(lowLevelGoals):
        
        print(f"Doing: {lowLevelgoal}..." )
        
        prompt = f"""
            Given the following goal:
            {lowLevelgoal}

            And the list of APIs below:
            {apiList}

            Identify the single API that best satisfies the goal. If no API satisfies the goal, return exactly "No API Found".
            Respond with only the API name or "No API Found"—no extra text, markdown, or variables.
        """
        
        response = generate_response(prompt, sys_prompt, APIMapping)
        
        result.append(json.loads(response))
        
    return result
            #response = generate_response(prompt,"you are an helpful assistant that helps developers to choose the best API that satisfy a given goal. The answer must be in a JSON format").strip()
            # Analizza la risposta come JSON
            #response_data = json.loads(response)
            #best_api = response_data.get("api_name", "No API Found")
            
            # Verifica se l'API è valida o restituisce "No API Found"
            #if best_api != "No API Found" and best_api not in [api["api_name"] for api in preprocessed_api_list]:
            #    print(f"Goal: {goal}. Invalid response: {best_api}")
            #else:
            #    print(f"Goal: {goal}. Best API: {best_api}")
        
        

In [46]:
print("Description STARTING...")
description = get_description("https://raw.githubusercontent.com/WebFuzzing/EMB/refs/heads/master/openapi-swagger/genome-nexus.json")
print("Description DONE...")
print(description)

Description STARTING...
Description DONE...
{"description":"The Genome Nexus API is a service providing access to genetic variant annotations. Users can interact with the API to obtain annotations using HTTP requests, and it supports a variety of programmatic clients in different programming languages such as Python, R, JavaScript, and TypeScript. A command-line client is also available for processing MAF and VCF files. The API primarily supports the '/annotation' endpoint, which provides long-term support, and users are advised that other endpoints may be subject to change.\n\nThe Genome Nexus API enables retrieval of VEP (Variant Effect Predictor) annotations for genomic variants, dbSNP ids, and genomic locations. The API offers data endpoints including those for Canonical Ensembl Gene and Transcript IDs by Entrez Gene ID, Hugo Symbols, accessing Ensembl Transcripts by IDs or gene information, Protein Data Bank (PDB) header information, PFAM domain information, and post-translational

In [47]:
print("High Level Goals STARTING...")
highLevelGoals = define_high_level_goals(description)
print("High Level Goals DONE...")
print(highLevelGoals)

High Level Goals STARTING...
High Level Goals DONE...
{'goals': [{'description': 'Enable users to obtain genetic variant annotations via HTTP requests.'}, {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}, {'description': 'Offer a command-line client for processing MAF and VCF files.'}, {'description': 'Support retrieval of VEP annotations for genomic variants, dbSNP ids, and genomic locations.'}, {'description': 'Enable access to Canonical Ensembl Gene and Transcript IDs.'}, {'description': 'Provide endpoints for accessing Hugo Symbols and Ensembl Transcripts by IDs or gene information.'}, {'description': 'Provide access to Protein Data Bank (PDB) header information and PFAM domain information.'}, {'description': 'Provide access to post-translational modifications via Ensembl Transcript IDs.'}, {'description': 'Offer web-based tools for annotating genetic variants.'}, {'description': 'Allow users to filter variant an

In [52]:
print("Low Level Goals STARTING...")
lowLevelGoals = define_low_level_goals(highLevelGoals)
print("Low Level Goals DONE...")
print(lowLevelGoals)

Low Level Goals STARTING...
Low Level Goals DONE...
{'low_level_goals': [{'description': 'Implement API endpoint for submitting genetic variant annotations requests', 'high_level_associated': {'description': 'Enable users to obtain genetic variant annotations via HTTP requests.'}}, {'description': 'Develop language-specific client libraries for API interaction in Python', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}, {'description': 'Develop language-specific client libraries for API interaction in R', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}, {'description': 'Develop language-specific client libraries for API interaction in JavaScript', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}, {'de

In [53]:
print("API List STARTING...")
apiList = get_api_list_from_swagger()
print("API List DONE...")
print(apiList)

API List STARTING...
API List DONE...
[API(api_name='fetchVariantAnnotationPOST', api_path='/annotation', description='Retrieves VEP annotation for the provided list of variants', request_type='post'), API(api_name='fetchVariantAnnotationByIdPOST', api_path='/annotation/dbsnp/', description='Retrieves VEP annotation for the provided list of dbSNP ids', request_type='post'), API(api_name='fetchVariantAnnotationByIdGET', api_path='/annotation/dbsnp/{variantId}', description='Retrieves VEP annotation for the give dbSNP id', request_type='get'), API(api_name='fetchVariantAnnotationByGenomicLocationPOST', api_path='/annotation/genomic', description='Retrieves VEP annotation for the provided list of genomic locations', request_type='post'), API(api_name='fetchVariantAnnotationByGenomicLocationGET', api_path='/annotation/genomic/{genomicLocation}', description='Retrieves VEP annotation for the provided genomic location', request_type='get'), API(api_name='fetchVariantAnnotationGET', api_path=

In [57]:
print("Mapping STARTING...")
mapping = define_mapping_apis_goals(lowLevelGoals["low_level_goals"], apiList)
print("Mapping DONE...")
print(mapping)

Mapping STARTING...


  0%|          | 0/20 [00:00<?, ?it/s]

Doing: {'description': 'Implement API endpoint for submitting genetic variant annotations requests', 'high_level_associated': {'description': 'Enable users to obtain genetic variant annotations via HTTP requests.'}}...


  5%|▌         | 1/20 [00:10<03:10, 10.02s/it]

Doing: {'description': 'Develop language-specific client libraries for API interaction in Python', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}...


 10%|█         | 2/20 [00:11<01:24,  4.72s/it]

Doing: {'description': 'Develop language-specific client libraries for API interaction in R', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}...


 15%|█▌        | 3/20 [00:12<00:56,  3.31s/it]

Doing: {'description': 'Develop language-specific client libraries for API interaction in JavaScript', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}...


 20%|██        | 4/20 [00:13<00:38,  2.41s/it]

Doing: {'description': 'Develop language-specific client libraries for API interaction in TypeScript', 'high_level_associated': {'description': 'Provide support for multiple programming languages including Python, R, JavaScript, and TypeScript.'}}...


 25%|██▌       | 5/20 [00:22<01:10,  4.72s/it]

Doing: {'description': 'Implement command-line interface for processing MAF files', 'high_level_associated': {'description': 'Offer a command-line client for processing MAF and VCF files.'}}...


 30%|███       | 6/20 [00:31<01:27,  6.23s/it]

Doing: {'description': 'Implement command-line interface for processing VCF files', 'high_level_associated': {'description': 'Offer a command-line client for processing MAF and VCF files.'}}...


 35%|███▌      | 7/20 [00:32<00:59,  4.60s/it]

Doing: {'description': 'Create API endpoint for retrieving VEP annotations by genomic variant', 'high_level_associated': {'description': 'Support retrieval of VEP annotations for genomic variants, dbSNP ids, and genomic locations.'}}...


 40%|████      | 8/20 [00:43<01:17,  6.46s/it]

Doing: {'description': 'Create API endpoint for retrieving VEP annotations by dbSNP id', 'high_level_associated': {'description': 'Support retrieval of VEP annotations for genomic variants, dbSNP ids, and genomic locations.'}}...


 45%|████▌     | 9/20 [00:44<00:53,  4.82s/it]

Doing: {'description': 'Create API endpoint for retrieving VEP annotations by genomic location', 'high_level_associated': {'description': 'Support retrieval of VEP annotations for genomic variants, dbSNP ids, and genomic locations.'}}...


 50%|█████     | 10/20 [00:46<00:38,  3.84s/it]

Doing: {'description': 'Create API endpoint for accessing Canonical Ensembl Gene IDs', 'high_level_associated': {'description': 'Enable access to Canonical Ensembl Gene and Transcript IDs.'}}...


 55%|█████▌    | 11/20 [00:47<00:27,  3.06s/it]

Doing: {'description': 'Create API endpoint for accessing Canonical Ensembl Transcript IDs', 'high_level_associated': {'description': 'Enable access to Canonical Ensembl Gene and Transcript IDs.'}}...


 60%|██████    | 12/20 [00:50<00:24,  3.00s/it]

Doing: {'description': 'Create API endpoint for retrieving Hugo Symbols by ID', 'high_level_associated': {'description': 'Provide endpoints for accessing Hugo Symbols and Ensembl Transcripts by IDs or gene information.'}}...


 65%|██████▌   | 13/20 [00:52<00:18,  2.63s/it]

Doing: {'description': 'Create API endpoint for retrieving Ensembl Transcripts by ID', 'high_level_associated': {'description': 'Provide endpoints for accessing Hugo Symbols and Ensembl Transcripts by IDs or gene information.'}}...


 70%|███████   | 14/20 [00:53<00:13,  2.19s/it]

Doing: {'description': 'Create API endpoint for retrieving Hugo Symbols by gene information', 'high_level_associated': {'description': 'Provide endpoints for accessing Hugo Symbols and Ensembl Transcripts by IDs or gene information.'}}...


 75%|███████▌  | 15/20 [00:55<00:10,  2.13s/it]

Doing: {'description': 'Implement API endpoint for accessing PDB header information', 'high_level_associated': {'description': 'Provide access to Protein Data Bank (PDB) header information and PFAM domain information.'}}...


 80%|████████  | 16/20 [00:57<00:08,  2.06s/it]

Doing: {'description': 'Implement API endpoint for accessing PFAM domain information', 'high_level_associated': {'description': 'Provide access to Protein Data Bank (PDB) header information and PFAM domain information.'}}...


 85%|████████▌ | 17/20 [00:59<00:06,  2.19s/it]

Doing: {'description': 'Create API endpoint for accessing post-translational modifications by Ensembl Transcript ID', 'high_level_associated': {'description': 'Provide access to post-translational modifications via Ensembl Transcript IDs.'}}...


 90%|█████████ | 18/20 [01:01<00:03,  1.99s/it]

Doing: {'description': 'Develop web-based tool for annotating genetic variants', 'high_level_associated': {'description': 'Offer web-based tools for annotating genetic variants.'}}...


 95%|█████████▌| 19/20 [01:02<00:01,  1.86s/it]

Doing: {'description': 'Implement filtering functionality for variant annotation results using specific fields', 'high_level_associated': {'description': 'Allow users to filter variant annotation results by fields.'}}...


100%|██████████| 20/20 [01:04<00:00,  3.24s/it]

Mapping DONE...
[{'api': [{'api_name': 'fetchVariantAnnotationPOST', 'api_path': '/annotation', 'description': 'Retrieves VEP annotation for the provided list of variants', 'request_type': 'post'}, {'api_name': 'fetchVariantAnnotationByIdPOST', 'api_path': '/annotation/dbsnp/', 'description': 'Retrieves VEP annotation for the provided list of dbSNP ids', 'request_type': 'post'}, {'api_name': 'fetchVariantAnnotationByIdGET', 'api_path': '/annotation/dbsnp/{variantId}', 'description': 'Retrieves VEP annotation for the give dbSNP id', 'request_type': 'get'}, {'api_name': 'fetchVariantAnnotationByGenomicLocationPOST', 'api_path': '/annotation/genomic', 'description': 'Retrieves VEP annotation for the provided list of genomic locations', 'request_type': 'post'}, {'api_name': 'fetchVariantAnnotationByGenomicLocationGET', 'api_path': '/annotation/genomic/{genomicLocation}', 'description': 'Retrieves VEP annotation for the provided genomic location', 'request_type': 'get'}, {'api_name': 'fetch


