In [None]:
def get_dir_path(s: str):
    """Build the relative path of an HTML file inside ``keyword_analysis/``.

    Despite the function's name, the result is a file path, not a directory:
    ``s`` is used as the file stem and ``.html`` is appended.

    Args:
        s: Name of the file, without extension.

    Returns:
        The string ``"keyword_analysis/<s>.html"``.
    """
    path_template = "keyword_analysis/%s.html"
    return path_template % s


# Two width/height presets; `xl_proportions` is the larger one.
# NOTE(review): presumably unpacked into a plotting/layout call as keyword
# arguments -- confirm at the use site.
proportions = {"width": 1600, "height": 1000}
xl_proportions = {"width": 2000, "height": 1400}

In [None]:

from pathlib import Path

import pandas as pd

# Collect every CSV under metadata/keywords that matches the pattern below.
# Sorting fixes the concatenation order (and so the row order of `df`),
# since `Path.glob` does not guarantee any particular ordering.
files = sorted(Path("metadata/keywords").glob("scverse.scanpy.1.10.2.*.csv"))

frames = []
for file in files:
    try:
        frames.append(pd.read_csv(file))
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Best effort: skip a file that cannot be read or parsed, but say why.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print(f"unable to read file {file}: {err}")

if not frames:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate".
    raise ValueError(
        "no readable CSV matched metadata/keywords/scverse.scanpy.1.10.2.*.csv"
    )

df = pd.concat(frames)
# Keep only the second dot-separated component of each `source` value.
df["source"] = df["source"].apply(lambda x: x.split('.')[1])
df.head()

In [None]:
# Rows are kept only when their sentence is strictly longer than this many
# characters.
min_sentence_length = 50
df_good_sentences = df[df['sentence'].str.len() > min_sentence_length]

# Preview up to 15 random rows. The size is capped at the number of rows that
# survived the filter, because `sample` raises when asked for more rows than
# exist. The fixed seed keeps the preview identical across runs.
preview_size = min(len(df_good_sentences), 15)
df_good_sentences.sample(preview_size, random_state=42)

In [None]:
# Number of retained sentences for each (source, quality_attribute) pair.
group_keys = ['source', 'quality_attribute']
df_good_sentences.groupby(group_keys).size()

In [None]:
# One-paragraph definition for each quality attribute, keyed by attribute name.
_attribute_descriptions = {
    "Availability": "The ability of a system to be ready to carry out its task when needed, encompassing reliability and recovery. It involves masking or repairing faults to prevent failures and ensuring cumulative service outage doesn't exceed a specified value. It's measured by MTBF/(MTBF + MTTR).",
    "Deployability": "The ability of software to be deployed to an environment for execution within predictable time and effort, including rollback if needed.  Focuses on automation, efficiency, and granularity of deployment.",
    "Energy Efficiency": "The system's ability to effectively utilize resources and minimize energy consumption while delivering required functionality. This involves resource monitoring, allocation, and adaptation, often balancing energy usage with performance, availability, and other quality attributes.",
    "Integrability": "The ease with which a system can be integrated with other systems or components, measured by the cost and technical risks associated with integration. It depends on the size and 'distance' of interfaces, including syntactic, semantic, behavioral, temporal, and resource differences.",
    "Modifiability": "The ease with which a system can be changed to add/delete/modify functionality, improve quality attributes, or adapt to new environments.  It considers the likelihood, time, cost, and impact of changes on coupling, cohesion, size, and binding time.",
    "Performance": "The system's ability to meet timing requirements, handling events and responding in a timely manner.  It involves managing processing time, blocked time due to resource contention, and optimizing resource usage under various load conditions.",
    "Safety": "The system's ability to avoid unsafe states that could cause harm. This involves recognizing and responding to omissions, commissions, timing errors, incorrect values, and sequence errors to prevent or mitigate damage and injury.",
    "Security": "The system's ability to protect data and information from unauthorized access while allowing access to authorized users. It focuses on confidentiality, integrity, and availability, using tactics to detect, resist, react to, and recover from attacks.",
    "Testability": "The ease with which software can be made to demonstrate faults through testing. This involves controlling and observing system state, limiting complexity, and facilitating the creation of test harnesses and oracles.",
    "Usability": "How easy it is for the user to accomplish desired tasks and the kind of user support provided. It covers learning, efficient use, error minimization, adaptation to user needs, and increasing user confidence and satisfaction.",
}

# Wrap every definition as {"desc": ...} so a lookup reads
# quality_attribs[name]["desc"]; insertion order follows the table above.
quality_attribs = {
    name: {"desc": text} for name, text in _attribute_descriptions.items()
}


In [None]:
# Draw at most `max_per_group` sentences for every (source, quality_attribute)
# pair. The fixed seed makes the selection -- and therefore the prompts built
# from it -- the same on every run.
# NOTE: this rebinds `df_good_sentences` to the sampled frame, so re-run the
# filtering cell first if this cell has to be executed again.
sample_seed = 42
max_per_group = 3

df_good_sentences = (
    df_good_sentences
    .groupby(['source', 'quality_attribute'])
    .apply(
        # Groups smaller than the cap are kept whole instead of raising.
        lambda x: x.sample(min(len(x), max_per_group), random_state=sample_seed),
        include_groups=False,
    )
    .reset_index()
)
# Attach the textual definition of each row's quality attribute; an attribute
# missing from `quality_attribs` raises KeyError rather than passing silently.
df_good_sentences["attribute_desc"] = df_good_sentences["quality_attribute"].apply(lambda x: quality_attribs[x]["desc"])
df_good_sentences

In [None]:

# Show the column names as a plain Python list.
df_good_sentences.columns.to_list()

In [None]:
def to_prompt(x):
    """Render the evaluation prompt for one keyword match.

    Args:
        x: A row-like object indexable by string key (e.g. a ``pandas.Series``
            or a ``dict``) providing ``source``, ``quality_attribute``,
            ``keyword``, ``matched_word``, ``sentence``, ``filename``,
            ``author``, ``repo``, ``version`` and ``attribute_desc``.

    Returns:
        The prompt text, which begins and ends with a newline.

    Raises:
        KeyError: If any of the keys listed above is missing from ``x``.
    """
    # The template lines stay at column 0 so no indentation leaks into the prompt.
    return f"""
Evaluate whether the matched keyword accurately aligns with its associated quality attribute, based on the context provided. The goal is to determine if the match makes sense in relation to the quality attribute description, or if it is a false-positive.

Data:

Source: {x['source']}
Quality Attribute: {x['quality_attribute']}
Keyword: {x['keyword']}
Matched Word: {x['matched_word']}
Sentence: "{x['sentence']}"
Filename: {x['filename']}
Author: {x['author']}
Repository: {x['repo']}
Version: {x['version']}
Attribute Description: "{x['attribute_desc']}"
Instructions: Based on the sentence, the matched word, and the attribute description, determine if the match is appropriate. Respond with Good if the matched word correctly reflects the intended quality attribute, or False-Positive if it doesn’t make sense in the given context.
"""

In [None]:
# Build one prompt per row by handing each row to `to_prompt`.
df_good_sentences['prompt'] = df_good_sentences.apply(to_prompt, axis=1)
df_good_sentences