###  Install Required Libraries



In [23]:
# Dependency installation is disabled: the %pip commands below sit inside a
# string literal, so running this cell installs nothing and only echoes the text.
# Remove the surrounding quotes to actually run the installs.
'''
%pip install openai
%pip install icecream
%pip install tqdm
%pip install requests
%pip install tabulate
%pip install scikit-learn
'''


'\n%pip install openai\n%pip install icecream\n%pip install tqdm\n%pip install requests\n%pip install tabulate\n%pip install scikit-learn\n'

In [24]:
from openai import OpenAI
from tools import get_markdown
import json
from tqdm import tqdm
from pydantic import BaseModel
from key import get_key


### Set Up the OpenAI API Key

In [25]:
# Shared OpenAI client used by every request below; the key comes from get_key().
client = OpenAI(api_key=get_key())

### Test the API Connection

In [26]:
# Smoke test: send a single trivial user message through the client.
chat_completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Say this is a test"}],
)

# Print the response
print(chat_completion.choices[0].message.content.strip())


This is a test.


## Models

In [27]:
class Action:
    """Plain record that pairs an action name with its description."""

    def __init__(self, name, description):
        self.name, self.description = name, description

In [28]:
def generate_response(prompt, sys_prompt, response_format, model="gpt-4o", max_tokens=2000):
    """Send a system/user prompt pair to the chat API and return the parsed result.

    Parameters:
    - prompt: content of the user message.
    - sys_prompt: content of the system message.
    - response_format: forwarded to the API as ``response_format``; callers in
      this notebook pass a pydantic model class.
    - model: model name to query (defaults to the previously hard-coded "gpt-4o").
    - max_tokens: completion token limit (defaults to the previously hard-coded 2000).

    Returns:
    - The ``parsed`` attribute of the first choice's message.

    Raises:
    - ValueError: if the first choice carries no parsed payload, so the failure
      surfaces here instead of as an AttributeError in the caller.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        max_tokens=max_tokens,
        response_format=response_format,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt},
        ],
    )

    message = response.choices[0].message
    if message.parsed is None:
        # getattr keeps this safe if the message object has no `refusal` field.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no parsed output (refusal: {refusal!r})")
    return message.parsed

# Define description

In [29]:
# Response schema passed to generate_response by get_description:
# a single free-text project description.
class DocumentDescription(BaseModel):
    description: str

In [30]:
def get_description(documentation_link=None):
    """Ask the model for a project description based on the project's README text.

    Parameters:
    - documentation_link: link to the project documentation. It must be provided,
      but NOTE that it is not fetched: the README excerpt is currently embedded
      directly in the prompt below.

    Returns:
    - The DocumentDescription returned by generate_response.

    Raises:
    - ValueError: if documentation_link is None.
    """
    if documentation_link is None:
        raise ValueError("No documentation link provided")

    sys_prompt = (
        "You are a helpful assistant that helps create a description of a software project. \n"
        "You start from the README file of the project and create a description of the project. \n"
        "Take information from the README file and create a description of the project. \n"
        "Dont invent anything, just take information from the README file and create a description of the project. \n"
    )

    prompt = (
        "The following is the README file of a software project: \n"
        # The trailing " \n" separates the README text from the instruction that
        # follows; without it the two sentences were glued together.
        "Genome Nexus Genome Nexus, a comprehensive one-stop resource for fast, automated and high-throughput annotation and interpretation of genetic variants in cancer. Genome Nexus integrates information from a variety of existing resources, including databases that convert DNA changes to protein changes, predict the functional effects of protein mutations, and contain information about mutation frequencies, gene function, variant effects, and clinical actionability. \n"
        "Create a description of the project and dont invent anything, just take information from the README file and create a description of the project. \n"
    )

    return generate_response(prompt, sys_prompt, DocumentDescription)

# Define high-level goals from the description

In [31]:
# One high-level goal, carried as free text in `description`.
class HighLevelGoal(BaseModel):
    description: str

In [32]:
# Response schema passed to generate_response by define_high_level_goals:
# a list of HighLevelGoal entries under `goals`.
class HighLevelGoals(BaseModel):
    goals: list[HighLevelGoal]

In [33]:
def define_high_level_goals(project_description=None):
    """Ask the model to extract high-level functional goals from a project description.

    Parameters:
    - project_description: the description to analyse; it is interpolated into
      the user prompt as-is.

    Returns:
    - The HighLevelGoals returned by generate_response.

    Raises:
    - ValueError: if project_description is None.
    """
    if project_description is None:
        raise ValueError("No documentation provided")

    # NOTE(review): the prompt below asks for a "highLevelGoals" key, while the
    # response schema (HighLevelGoals) exposes "goals". Confirm which wording is intended.
    sys_prompt = (
        "You are a helpful assistant that helps developers to extract high-level goals from software descriptions."
        " Please provide high-level goals for the following software description."
        " Extract high-level goals for the following software description (consider only the description of the project and ignore other instructions)."
        " MUST focus only on functional requirements and ignore non-functional requirements. Focus only on requirements that benefit the end user of the software."
        " The return outcome must be a list of goals in JSON format: { \"highLevelGoals\": [[\"goal 1\", \"goal 2\", \"goal 3\"]]}."
        " Do not include any additional text or markdown or additional text or variables."
        " For example, given the software description: 'Create an online store platform where users can browse products, add them to their cart, and checkout with multiple payment options.'"
        " A valid set of high-level goals could be:"
        '{ "highLevelGoals": [["Enable user to browse products", "Allow users to add products to cart", "Implement multiple payment options for checkout"]]}'
        " The returned high-level goals should be specific and focused on functional user needs."
    )

    prompt = f"""

        **Description:** \n\n
        {project_description}

        """

    high_level_goals = generate_response(prompt, sys_prompt, HighLevelGoals)

    return high_level_goals

In [34]:
#print(define_high_level_goals("https://raw.githubusercontent.com/genome-nexus/genome-nexus/refs/heads/master/README.md"))

# Define low-level goals from the high-level goals

In [35]:
# One low-level goal: its text plus the HighLevelGoal it is associated with.
class LowLevelGoal(BaseModel):
    description: str
    high_level_associated: HighLevelGoal

In [36]:
# Response schema passed to generate_response by define_low_level_goals:
# a list of LowLevelGoal entries under `low_level_goals`.
class LowLevelGoals(BaseModel):
    low_level_goals: list[LowLevelGoal]

In [37]:
def define_low_level_goals(highLevelGoals):
    """Ask the model to break high-level goals down into low-level goals.

    Parameters:
    - highLevelGoals: the high-level goals; their string form is interpolated
      into the user prompt.

    Returns:
    - The LowLevelGoals returned by generate_response.
    """
    system_message = (
        "You are a helpful assistant that helps developers to extract low-level goals from high-level goals."
        " Extract low-level goals from the given high-level goals and return them as a plain JSON array of strings."
        " The low-level goals that you create MUST be structured to match against a set of API calls. Don't be too generic, for example, avoid goals like 'make the software fast', 'develop a web interface' etc."
        " MUST focus only on functional requirements and ignore non-functional requirements. Focus only on requirements that benefit the end user of the software."
        " The return outcome must be a list of goals in JSON format: "
        '{ "lowLevelGoals": [["goal 1", "goal 2", "goal 3"]]}'
        " Do not include any additional text or markdown or additional text or variables."
        " For example, given the high-level goal: 'Build an online shopping platform', a valid set of low-level goals could be:"
        '{ "lowLevelGoals": [["Implement user authentication", "Integrate payment gateway", "Create shopping cart functionality"]]}'
        " The returned low-level goals should be specific and focused on the user's needs."
    )

    # Spelled out piece by piece so the exact whitespace of the prompt is explicit.
    user_message = (
        " \n"
        "        Define low level goals from this High-level goals:\n"
        f"        {highLevelGoals}\n"
        "    "
    )

    return generate_response(user_message, system_message, LowLevelGoals)

### Get API List from Swagger

In [38]:
# One API operation, as built by get_api_list_from_swagger.
class API(BaseModel):
    api_name: str      # filled from the operation's "operationId"
    api_path: str      # the key of the operation's entry under "paths"
    description: str   # filled from the operation's "summary"
    request_type: str  # the method key of the operation (e.g. 'get', 'post')

In [39]:
# Default specification used by this notebook.
GENOME_NEXUS_SWAGGER_URL = "https://raw.githubusercontent.com/WebFuzzing/EMB/refs/heads/master/openapi-swagger/genome-nexus.json"

# Keys of a path item that denote operations; other keys (e.g. "parameters") are not operations.
_HTTP_METHODS = frozenset({"get", "put", "post", "delete", "options", "head", "patch", "trace"})


def get_api_list_from_swagger(swagger_url=GENOME_NEXUS_SWAGGER_URL):
    """Download a Swagger/OpenAPI document and flatten its operations into API records.

    Parameters:
    - swagger_url: location of the JSON specification, fetched with get_markdown.
      Defaults to the Genome Nexus specification.

    Returns:
    - A list of API objects, one per (path, HTTP method) operation.

    Raises:
    - KeyError: if the document has no "paths" section.
    - json.JSONDecodeError: if the fetched text is not valid JSON.
    """
    swagger_text = get_markdown(swagger_url)
    paths = json.loads(swagger_text)["paths"]

    api_list = []
    for api_path, path_item in paths.items():
        for method, operation in path_item.items():
            # Skip non-operation keys instead of crashing on them.
            if method.lower() not in _HTTP_METHODS:
                continue
            api_list.append(
                API(
                    # "operationId" and "summary" may be missing; fall back to
                    # values that keep the record usable.
                    api_name=operation.get("operationId") or f"{method.upper()} {api_path}",
                    api_path=api_path,
                    description=operation.get("summary") or operation.get("description") or "",
                    request_type=method,
                )
            )

    return api_list


### Mapping goal to API

In [40]:
# Response schema passed to generate_response by define_mapping_apis_goals:
# a low-level goal together with the APIs mapped to it.
class APIMapping(BaseModel):
    APIs: list[API]
    low_level_goal: LowLevelGoal

In [50]:
# Import tabulate for nice table formatting
from tabulate import tabulate

def api_list_to_string(api_list):
    """Join the names of the given APIs into one comma-separated line.

    Parameters:
    - api_list: iterable of objects exposing an `api_name` string attribute.

    Returns:
    - The names joined by ", " and terminated by a newline ("\\n" alone for an
      empty list).
    """
    # join() only inserts separators between names. The previous rstrip(", ")
    # removed any trailing ',' or ' ' characters, including ones that were part
    # of the last name.
    return ", ".join(api.api_name for api in api_list) + "\n"

def define_mapping_apis_goals(lowLevelGoals, apiList):
    """Ask the model, one goal at a time, which APIs satisfy each low-level goal.

    Parameters:
    - lowLevelGoals: object exposing `low_level_goals`, the goals to map.
    - apiList: the candidate APIs; their string form is interpolated into the prompt.

    Returns:
    - A list with one generate_response result (APIMapping) per goal, in goal order.

    Each result is also printed as it arrives.
    """
    sys_prompt = (
        "You are a helpful assistant that helps developers to map low-level goals to APIs."
        " You will be given a low-level goal and a list of APIs. Your task is to identify which APIs best satisfy each low-level goal."
        # Leading space added: the sentences were previously glued together.
        " Respond with only the API name or 'No API Found' in the api_name field"
    )

    result = []

    for lowLevelgoal in lowLevelGoals.low_level_goals:
        # The instruction used to ask for "the single API" and "maximum three
        # APIs" in the same sentence; it now states one consistent limit.
        prompt = f"""
            Given the following goal:
            {lowLevelgoal}

            And the list of APIs below:
            {apiList}

            Identify the APIs that best satisfy the goal, returning at most three APIs. If no API satisfies the goal, return exactly "No API Found".
            Respond with only the API names or "No API Found"—no extra text, markdown, or variables.
        """

        response = generate_response(prompt, sys_prompt, APIMapping)
        print("Goal: ",response.low_level_goal.description)
        print("APIs: ", api_list_to_string(response.APIs))
        result.append(response)

    return result

        

def print_api_goal_mapping(mappings):
    """
    Print the goal-to-API mappings as a formatted table.

    Parameters:
    - mappings: iterable of mapping objects. Each one must expose
        - `low_level_goal.description`: the goal text.
        - `APIs`: the mapped APIs, rendered with api_list_to_string.

    Any exception raised while building or printing the table is caught and
    reported as a printed message instead of being propagated.
    """
    try:
        # One table row per mapping
        rows = [
            {
                "Low-Level Goal": entry.low_level_goal.description,
                "Mapped APIs": api_list_to_string(entry.APIs),
            }
            for entry in mappings
        ]
        print(tabulate(rows, headers="keys", tablefmt="fancy_grid"))
    except Exception as e:
        print(f"Error while printing mapping: {e}")

In [42]:
print("Description STARTING...")
# get_description expects the project's documentation (README) link; the Swagger
# specification URL that was passed here belongs to get_api_list_from_swagger.
description = get_description("https://raw.githubusercontent.com/genome-nexus/genome-nexus/refs/heads/master/README.md")
print("Description DONE...")
print(description)

Description STARTING...
Description DONE...
description='Genome Nexus is a comprehensive resource designed for the rapid, automated, and high-throughput annotation and interpretation of genetic variants in cancer. The platform integrates data from various existing resources to provide detailed insights into genetic mutations. This includes databases that translate DNA changes into protein alterations, assess the functional impact of protein mutations, and offer information on mutation frequencies, gene functions, variant effects, and clinical relevance. Genome Nexus serves as a one-stop solution for researchers and clinicians working with cancer genomics, facilitating the understanding and analysis of genetic data in a streamlined and efficient manner.'


In [43]:
# Derive the high-level goals from the description produced by the previous cell.
print("High Level Goals STARTING...")
highLevelGoals = define_high_level_goals(description)
print("High Level Goals DONE...")
print(highLevelGoals)

High Level Goals STARTING...
High Level Goals DONE...
goals=[HighLevelGoal(description='Enable rapid annotation of genetic variants in cancer.'), HighLevelGoal(description='Provide automated interpretation of genetic variants in cancer.'), HighLevelGoal(description='Integrate existing data resources for comprehensive genetic insights.'), HighLevelGoal(description='Translate DNA changes to protein alterations for user understanding.'), HighLevelGoal(description='Assess functional impact of protein mutations for informed analysis.'), HighLevelGoal(description='Provide information on mutation frequencies for research utility.'), HighLevelGoal(description='Deliver gene function data to aid in variant analysis.'), HighLevelGoal(description='Interpret variant effects for clinical and research guidance.'), HighLevelGoal(description='Present clinical relevance of mutations for user reference.')]


In [44]:
# Break the high-level goals down into low-level goals.
print("Low Level Goals STARTING...")
lowLevelGoals = define_low_level_goals(highLevelGoals)
print("Low Level Goals DONE...")
print(lowLevelGoals)

Low Level Goals STARTING...
Low Level Goals DONE...
low_level_goals=[LowLevelGoal(description='Implement a user interface for variant selection and annotation.', high_level_associated=HighLevelGoal(description='Enable rapid annotation of genetic variants in cancer.')), LowLevelGoal(description='Create API for bulk upload and processing of genetic variant data.', high_level_associated=HighLevelGoal(description='Enable rapid annotation of genetic variants in cancer.')), LowLevelGoal(description='Develop algorithms for automated classification of variant pathogenicity.', high_level_associated=HighLevelGoal(description='Provide automated interpretation of genetic variants in cancer.')), LowLevelGoal(description='Integrate ClinVar and COSMIC data sources.', high_level_associated=HighLevelGoal(description='Integrate existing data resources for comprehensive genetic insights.')), LowLevelGoal(description='Integrate user-friendly data visualization tools.', high_level_associated=HighLevelGoal(

In [45]:
# Load the API operations from the Swagger specification.
print("API List STARTING...")
apiList = get_api_list_from_swagger()
print("API List DONE...")
print(apiList)

API List STARTING...
API List DONE...
[API(api_name='fetchVariantAnnotationPOST', api_path='/annotation', description='Retrieves VEP annotation for the provided list of variants', request_type='post'), API(api_name='fetchVariantAnnotationByIdPOST', api_path='/annotation/dbsnp/', description='Retrieves VEP annotation for the provided list of dbSNP ids', request_type='post'), API(api_name='fetchVariantAnnotationByIdGET', api_path='/annotation/dbsnp/{variantId}', description='Retrieves VEP annotation for the give dbSNP id', request_type='get'), API(api_name='fetchVariantAnnotationByGenomicLocationPOST', api_path='/annotation/genomic', description='Retrieves VEP annotation for the provided list of genomic locations', request_type='post'), API(api_name='fetchVariantAnnotationByGenomicLocationGET', api_path='/annotation/genomic/{genomicLocation}', description='Retrieves VEP annotation for the provided genomic location', request_type='get'), API(api_name='fetchVariantAnnotationGET', api_path=

In [46]:
# Map every low-level goal to APIs; define_mapping_apis_goals prints each result as it goes.
print("Mapping STARTING...")
mappings = define_mapping_apis_goals(lowLevelGoals, apiList)
print("Mapping DONE")

Mapping STARTING...
Goal:  Implement a user interface for variant selection and annotation.
APIs:  fetchVariantAnnotationPOST, fetchVariantAnnotationByIdPOST, fetchVariantAnnotationByIdGET, fetchVariantAnnotationByGenomicLocationPOST, fetchVariantAnnotationByGenomicLocationGET, fetchVariantAnnotationGET, fetchCanonicalEnsemblGeneIdByEntrezGeneIdsPOST, fetchCanonicalEnsemblGeneIdByEntrezGeneIdGET, fetchCanonicalEnsemblGeneIdByHugoSymbolsPOST, fetchCanonicalEnsemblGeneIdByHugoSymbolGET, fetchCanonicalEnsemblTranscriptsByHugoSymbolsPOST, fetchCanonicalEnsemblTranscriptByHugoSymbolGET, fetchEnsemblTranscriptsGET, fetchEnsemblTranscriptsByEnsemblFilterPOST, fetchEnsemblTranscriptByTranscriptIdGET, fetchGeneXrefsGET, fetchPdbHeaderPOST, fetchPdbHeaderGET, fetchPfamDomainsByPfamAccessionPOST, fetchPfamDomainsByAccessionGET, fetchPostTranslationalModificationsGET, fetchPostTranslationalModificationsByPtmFilterPOST, fetchVersionGET

Goal:  Create API for bulk upload and processing of genetic va

In [47]:
# Show the same mappings again, this time as a formatted table.
print("\n\n")
print_api_goal_mapping(mappings)




╒═══════════════════════════════════════════════════════════════════════════════════════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╕
│ Low-Level Goal                                       

# Evaluation

In [32]:
# Environment check: show which Python interpreter this kernel is running.
import sys
print(sys.executable)

C:\Users\Luca\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe


In [1]:
# Import check: the message below is only printed if the transformers import succeeds.
from transformers import AutoTokenizer, AutoModel
print("Transformers library imported successfully!")

Transformers library imported successfully!


In [1]:
from goal_evaluator import GoalEvaluator

# Hard-coded copy of the generated high-level goals; the commented line below
# derives them from highLevelGoals instead.
#generated_goals = [goal.description for goal in highLevelGoals.goals]
generated_goals = [
    'Enable rapid annotation of genetic variants in cancer.',
    'Provide automated interpretation of genetic variants in cancer.',
    'Integrate existing data resources for comprehensive genetic insights.',
    'Translate DNA changes to protein alterations for user understanding.',
    'Assess functional impact of protein mutations for informed analysis.',
    'Provide information on mutation frequencies for research utility.',
    'Deliver gene function data to aid in variant analysis.',
    'Interpret variant effects for clinical and research guidance.',
    'Present clinical relevance of mutations for user reference.',
]
print("Generated Goals:", generated_goals)

# Manually written goals that the generated ones are compared against.
manual_goals = [
    "Provide fast and automated annotation of genetic variants",
    "Enable high-throughput interpretation of genetic variants",
    "Integrate information from various existing resources",
    "Convert DNA changes to protein changes",
    "Predict functional effects of protein mutations",
    "Provide information about mutation frequencies",
    "Offer insights into gene function",
    "Detail variant effects",
    "Highlight clinical actionability of variants",
]

evaluator = GoalEvaluator(preprocess=True)

# Run the comparison. NOTE(review): 0.85 is presumably the similarity
# threshold; confirm against GoalEvaluator.evaluate.
results = evaluator.evaluate(generated_goals, manual_goals, 0.85)

# Report each metric next to its label.
for label, key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1 Score:", "f1_score"),
    ("Similarities Matrix:", "similarities"),
):
    print(label, results[key])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Luca\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Generated Goals: ['Enable rapid annotation of genetic variants in cancer.', 'Provide automated interpretation of genetic variants in cancer.', 'Integrate existing data resources for comprehensive genetic insights.', 'Translate DNA changes to protein alterations for user understanding.', 'Assess functional impact of protein mutations for informed analysis.', 'Provide information on mutation frequencies for research utility.', 'Deliver gene function data to aid in variant analysis.', 'Interpret variant effects for clinical and research guidance.', 'Present clinical relevance of mutations for user reference.']
Precision: 0.75
Recall: 0.375
F1 Score: 0.4999999995555555
Similarities Matrix: [[0.8431102  0.8469818  0.5954497  0.6638706  0.77890575 0.55434763
  0.6835572  0.48475713 0.68637824]
 [0.8571105  0.8199843  0.6008519  0.6827131  0.8104898  0.5800293
  0.69389766 0.52214116 0.70709634]
 [0.81855285 0.80292684 0.8537177  0.5655743  0.68252176 0.63904446
  0.5867048  0.47100928 0.5961

In [4]:
# Same evaluation as above, but with preprocessing switched off.
evaluator2 = GoalEvaluator(preprocess=False)

# `results` is overwritten here, so later cells see this run's output.
results = evaluator2.evaluate(generated_goals, manual_goals, 0.85)

# Report each metric next to its label.
for label, key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1 Score:", "f1_score"),
    ("Similarities Matrix:", "similarities"),
):
    print(label, results[key])

Precision: 0.8571428571428571
Recall: 0.75
F1 Score: 0.7999999995022221
Similarities Matrix: [[0.8982619  0.8496185  0.6100298  0.6533114  0.76715285 0.677038
  0.7171951  0.40201533 0.75421524]
 [0.8337025  0.8507133  0.6607107  0.7086705  0.8092898  0.7604251
  0.78178835 0.48889658 0.79795706]
 [0.8149216  0.80522496 0.85828197 0.6479548  0.69075245 0.733842
  0.7740383  0.5019614  0.6854527 ]
 [0.780015   0.77360296 0.7257144  0.82547    0.7903491  0.73542
  0.7780063  0.54469603 0.7122859 ]
 [0.80832803 0.8220601  0.668863   0.7287352  0.8848871  0.7439874
  0.7798312  0.53740585 0.83658653]
 [0.7745307  0.76382315 0.7478328  0.69512683 0.79239535 0.8850665
  0.80313605 0.58315754 0.76863825]
 [0.85209125 0.8489872  0.6836262  0.6866504  0.80996376 0.75151145
  0.77055997 0.47675204 0.762099  ]
 [0.7290589  0.743848   0.711334   0.6715676  0.7871947  0.7666164
  0.74786055 0.5867243  0.81899065]
 [0.7219324  0.73448277 0.7070222  0.6600082  0.7918509  0.75035083
  0.7477973  0.571

In [5]:
# Uses whichever `results` the most recently executed evaluation cell produced.
similarities = results["similarities"]

# Find the best match for each generated goal
for i, gen_goal in enumerate(generated_goals):
    row = similarities[i]
    best_match_idx = row.argmax()
    best_match_score = row.max()
    print(f"Generated goal: '{gen_goal}' -> Best match: '{manual_goals[best_match_idx]}' (Score: {best_match_score:.2f})")


Generated goal: 'Enable rapid annotation of genetic variants in cancer.' -> Best match: 'Provide fast and automated annotation of genetic variants' (Score: 0.90)
Generated goal: 'Provide automated interpretation of genetic variants in cancer.' -> Best match: 'Enable high-throughput interpretation of genetic variants' (Score: 0.85)
Generated goal: 'Integrate existing data resources for comprehensive genetic insights.' -> Best match: 'Integrate information from various existing resources' (Score: 0.86)
Generated goal: 'Translate DNA changes to protein alterations for user understanding.' -> Best match: 'Convert DNA changes to protein changes' (Score: 0.83)
Generated goal: 'Assess functional impact of protein mutations for informed analysis.' -> Best match: 'Predict functional effects of protein mutations' (Score: 0.88)
Generated goal: 'Provide information on mutation frequencies for research utility.' -> Best match: 'Provide information about mutation frequencies' (Score: 0.89)
Generated