###  Install Required Libraries



In [82]:

%pip install openai
%pip install icecream
%pip install tqdm
%pip install requests


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [83]:
from openai import OpenAI
from tools import get_markdown
import json
from tqdm import tqdm
from key import get_key


### Set Up the OpenAI API Key

In [84]:
# Set your GPT-4 API key
client = OpenAI(
    api_key= get_key()
)

### Test the API Connection

In [85]:
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="gpt-4o",
)

# Stampa la risposta
print(chat_completion.choices[0].message.content.strip())


This is a test.


In [86]:
def generate_response(prompt, sys_prompt):
    response = client.chat.completions.create(
        messages=[
            { "role": "system", "content":  sys_prompt},
            { "role": "user", "content": prompt }
        ],
        model="gpt-4o",
        max_tokens=500,
        response_format={ "type": "json_object" }
    )
    return response.choices[0].message.content.strip()

In [87]:
project_description = get_markdown(link="https://raw.githubusercontent.com/genome-nexus/genome-nexus/refs/heads/master/README.md")

sys_prompt = (  "You are an helpful assistant that helps developers to extract high-level goals from software descriptions."
                "Please provide a high-level goals for the following software description."
                "Extract high-level goals for the following software description (consider only the description of the project and ignore other instructions"
                "MUST focus only on functional requirements and ignore non-functional requirements. Focus only on requirements that benefit the end user of the software."
                "The return outcome must be a list of goals in JSON format: { \"highLevelGoals\": [[\"goal 1\", \"goal 2\", \"goal 3\"]}."
            )

prompt = f"""

**Description:**
{project_description}

"""

high_level_goals = generate_response(prompt, sys_prompt)
print(high_level_goals)

{
  "highLevelGoals": [
    "Enable fast and automated annotation of genetic variants in cancer.",
    "Provide high-throughput interpretation of genetic variants using integrated resources.",
    "Convert DNA changes to protein changes and predict functional effects of protein mutations.",
    "Provide information about mutation frequencies and gene functions.",
    "Assess the clinical actionability of genetic variants.",
    "Support multiple input formats for variant annotation requests.",
    "Offer a user interface for browsing annotated genetic variants.",
    "Support both human and mouse genomic analyses."
  ]
}


In [88]:
sys_prompt = (
    "You are an helpful assistant that helps developers to extract low-level goals from high-level goals."
    "Extract low-level goals from these high-level goals and return them as a plain JSON array of strings."
    "The low-level goals that you create MUST be structured to match against a set of API calls. Dont be too generic, for example, avoid goals like 'make the software fast', 'develop a web interface' etc."
    "MUST focus only on functional requirements and ignore non-functional requirements. Focus only on requirements that benefit the end user of the software."
    "The return outcome must be a list of goals in JSON format: "
    "{ \"lowLevelGoals\": [[\"goal 1\", \"goal 2\", \"goal 3\"]}. Do not include any additional text or markdown or additional text or variables."
)

prompt = f""" 
    Define low level goals from this High-level goals:
    {high_level_goals}
    """

low_level_goals = generate_response(prompt, sys_prompt)
print(low_level_goals)

{ "lowLevelGoals": [
    "Implement automated pipeline for genetic variant annotation.",
    "Integrate multiple databases for variant interpretation.",
    "Develop algorithm to translate DNA changes to protein changes.",
    "Implement prediction model for functional effects of protein mutations.",
    "Integrate mutation frequency data into the system.",
    "Provide detailed gene function information.",
    "Implement framework to assess clinical actionability of variants.",
    "Support VCF format for variant annotation input.",
    "Support BED format for variant annotation input.",
    "Create user interface for browsing annotated genetic variants.",
    "Implement support for human genome data analysis.",
    "Implement support for mouse genome data analysis."
  ]
}


In [89]:
json_goals = json.loads(low_level_goals)["lowLevelGoals"]

print(len(json_goals))

12


### Get API List from Swagger

In [90]:
api_list = get_markdown("https://raw.githubusercontent.com/WebFuzzing/EMB/refs/heads/master/openapi-swagger/genome-nexus.json")

json_api_list = json.loads(api_list)["paths"]
api_paths = json_api_list.keys()

preprocessed_api_list = []

for api in api_paths:
    path = json_api_list[api]
    for method in path.keys():
        preprocessed_api_list.append({
            "api_name": path[method]["operationId"],
            "api_path": api,
            "description": path[method]["summary"],
            "request_type": method
        })


In [91]:
for goal in json_goals:
    print(goal)

Implement automated pipeline for genetic variant annotation.
Integrate multiple databases for variant interpretation.
Develop algorithm to translate DNA changes to protein changes.
Implement prediction model for functional effects of protein mutations.
Integrate mutation frequency data into the system.
Provide detailed gene function information.
Implement framework to assess clinical actionability of variants.
Support VCF format for variant annotation input.
Support BED format for variant annotation input.
Create user interface for browsing annotated genetic variants.
Implement support for human genome data analysis.
Implement support for mouse genome data analysis.


### Mapping goal to API

In [92]:
goal

'Implement support for mouse genome data analysis.'

In [93]:
print(f"Goal: {goal}")
print(f"len", str(api_list))

Goal: Implement support for mouse genome data analysis.
len {
  "swagger": "2.0",
  "info": {
    "description": "This page shows how to use HTTP requests to access the Genome Nexus API. There are more high level clients available in Python, R, JavaScript, TypeScript and various other languages as well as a command line client to annotate MAF and VCF. See https://docs.genomenexus.org/api.\n\nAside from programmatic clients there are web based tools to annotate variants, see https://docs.genomenexus.org/tools.\n\n We currently only provide long-term support for the '/annotation' endpoint. The other endpoints might change.",
    "version": "2.0",
    "title": "Genome Nexus API",
    "license": {
      "name": "MIT License",
      "url": "https://github.com/genome-nexus/genome-nexus/blob/master/LICENSE"
    }
  },
  "host": "localhost:28125",
  "basePath": "/",
  "tags": [
    {
      "name": "info-controller",
      "description": "Info Controller"
    },
    {
      "name": "pdb-control

In [94]:
for goal in tqdm(json_goals):
    prompt = f"""
        Given the following goal:
        {goal}

        And the list of APIs below:
        {preprocessed_api_list}

        Identify the single API that best satisfies the goal. If no API satisfies the goal, return exactly "No API Found".
        Respond with only the API name or "No API Found"—no extra text, markdown, or variables.
    """
    
    try:
        response = generate_response(prompt,"you are an helpful assistant that helps developers to choose the best API that satisfy a given goal. The answer must be in a JSON format").strip()
        # Analizza la risposta come JSON
        response_data = json.loads(response)
        best_api = response_data.get("api_name", "No API Found")
        
        # Verifica se l'API è valida o restituisce "No API Found"
        if best_api != "No API Found" and best_api not in [api["api_name"] for api in preprocessed_api_list]:
            print(f"Goal: {goal}. Invalid response: {best_api}")
        else:
            print(f"Goal: {goal}. Best API: {best_api}")
    
    except Exception as e:
        print(f"Error occurred for goal '{goal}': {e}")

  8%|▊         | 1/12 [00:00<00:06,  1.77it/s]

Goal: Implement automated pipeline for genetic variant annotation.. Best API: fetchVariantAnnotationPOST


 17%|█▋        | 2/12 [00:01<00:06,  1.61it/s]

Goal: Integrate multiple databases for variant interpretation.. Best API: No API Found


 25%|██▌       | 3/12 [00:01<00:06,  1.50it/s]

Goal: Develop algorithm to translate DNA changes to protein changes.. Best API: fetchVariantAnnotationPOST


 33%|███▎      | 4/12 [00:02<00:05,  1.50it/s]

Goal: Implement prediction model for functional effects of protein mutations.. Best API: fetchVariantAnnotationPOST


 42%|████▏     | 5/12 [00:03<00:04,  1.56it/s]

Goal: Integrate mutation frequency data into the system.. Best API: fetchVariantAnnotationPOST


 50%|█████     | 6/12 [00:03<00:04,  1.50it/s]

Goal: Provide detailed gene function information.. Best API: No API Found


 58%|█████▊    | 7/12 [00:04<00:03,  1.57it/s]

Goal: Implement framework to assess clinical actionability of variants.. Best API: fetchVariantAnnotationPOST


 67%|██████▋   | 8/12 [00:05<00:02,  1.54it/s]

Goal: Support VCF format for variant annotation input.. Best API: fetchVariantAnnotationPOST


 75%|███████▌  | 9/12 [00:05<00:01,  1.57it/s]

Goal: Support BED format for variant annotation input.. Best API: No API Found


 83%|████████▎ | 10/12 [00:06<00:01,  1.54it/s]

Goal: Create user interface for browsing annotated genetic variants.. Best API: fetchVariantAnnotationPOST


 92%|█████████▏| 11/12 [00:07<00:00,  1.58it/s]

Goal: Implement support for human genome data analysis.. Best API: No API Found


100%|██████████| 12/12 [00:07<00:00,  1.54it/s]

Goal: Implement support for mouse genome data analysis.. Best API: No API Found



