In [None]:
def get_dir_path(s: str) -> str:
    """Return the relative path of the HTML file named ``s`` under ``keyword_analysis/``."""
    return f"keyword_analysis/{s}.html"


# Figure sizes that can be unpacked into ``fig.update_layout(**...)``.
proportions = {"width": 1600, "height": 1000}
xl_proportions = {"width": 2000, "height": 1400}

In [None]:

from pathlib import Path

import numpy as np
import pandas as pd

# To restrict the load to a single release, narrow the glob pattern, e.g.
# "scverse.scanpy.1.10.2.*.csv".
# Sorted so the row order of the combined frame does not depend on the
# filesystem's directory listing order.
files = sorted(Path("metadata/keywords").glob("*.csv"))

# Read every keyword CSV. A file that cannot be read is reported and skipped so
# that one bad export does not abort the whole load.
frames = []
for file in files:
    try:
        file_df = pd.read_csv(file)
        frames.append(file_df)
    except Exception as exc:
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # still interrupt the cell; the cause is printed instead of hidden.
        print(f"unable to read file {file}: {exc!r}")

if not frames:
    # pd.concat([]) would raise a ValueError as well, just with a less helpful message.
    raise ValueError("no readable CSV files found in metadata/keywords")

df = pd.concat(frames)
# Keep only the second dot-separated component of the raw `source` value.
df["source"] = df["source"].apply(lambda x: x.split('.')[1])
df.head()

In [None]:
# Keep only rows whose sentence is longer than `min_sentence_length` characters.
min_sentence_length = 50
df_good_sentences = df.loc[df["sentence"].str.len().gt(min_sentence_length)]
df_good_sentences.sample(n=15)

In [None]:
# Distinct quality attributes present in the filtered sentences, in order of first appearance.
all_quality_attributes = df_good_sentences["quality_attribute"].drop_duplicates().tolist()
all_quality_attributes

In [None]:
# Number of filtered sentences for every (source, quality attribute) pair.
df_good_sentences.groupby(by=["source", "quality_attribute"]).size()

In [None]:
# Sentence counts as a source x quality-attribute table.
df_good_sentences.pivot_table(
    values="sentence",
    index="source",
    columns="quality_attribute",
    aggfunc="count",
)

In [None]:
# Inspect the rows of one group: source 'WIKI', quality attribute 'Usability'.
by_source_and_attribute = df_good_sentences.groupby(["source", "quality_attribute"])
by_source_and_attribute.get_group(("WIKI", "Usability"))

In [None]:
# Quality attribute name -> {"desc": description}. The description is looked up
# by attribute name and inserted into the verification prompts.
quality_attribs = {
    attribute: {"desc": description}
    for attribute, description in {
        "Availability": "The system's readiness to perform its function when required, focusing on reliability and recovery. It involves fault masking or repair to prevent failures, ensuring minimal cumulative downtime.",
        "Deployability": "The capability of software to be deployed into an operational environment with predictable time and effort, including options for rollback if needed. Key aspects include automation, deployment speed, and deployment granularity.",
        "Energy Efficiency": "The system’s ability to optimize resource use and minimize energy consumption while achieving required performance. This involves monitoring, allocation, and adaptation of resources.",
        "Integrability": "The ease of combining the system with other systems or components, measured by integration cost and technical risks. Integrability considers the complexity and compatibility of interfaces, including syntactic, semantic, behavioral, and temporal alignment.",
        "Modifiability": "The ease with which the system can be adapted by adding, removing, or modifying features, or adjusting to new environments. This attribute involves assessing the time, cost, and impact of changes, considering factors like coupling, cohesion, and the scope of modifications.",
        "Performance": "The system’s capacity to meet its timing requirements, managing event handling and response times effectively. Performance focuses on reducing blocked time from resource contention and optimizing resource utilization under varying load conditions.",
        "Safety": "The system’s ability to avoid states that could lead to harm or damage. Safety encompasses detection and handling of errors (e.g., omissions, timing, incorrect values) to prevent hazardous outcomes or mitigate potential damage.",
        "Security": "The system’s ability to safeguard information against unauthorized access, while permitting authorized access. Security emphasizes confidentiality, integrity, and availability, using tactics to detect, prevent, and respond to attacks.",
        "Testability": "The ease of validating software functionality through testing, enabling fault detection. This includes controlling and observing the system’s state, reducing complexity, and facilitating the creation of test cases and oracles.",
        "Usability": "The degree to which users can effectively and efficiently accomplish tasks, including support for error recovery and user satisfaction. Usability covers ease of learning, efficient usage, and adaptability to user needs.",
    }.items()
}

In [None]:
# Per-(source, quality attribute) sentence counts, with the group keys turned back into columns.
df_good_sentences.groupby(by=["source", "quality_attribute"]).size().reset_index()

In [None]:
# Draw up to `sample_size` sentences from every (source, quality attribute)
# group and attach the attribute description used by the prompts.
sample_size = 15
# Fixed seed: without it every run draws a different sample, so the exported
# CSV could not be regenerated.
sample_seed = 42
# One generator shared by all groups; re-created on every run of this cell so
# re-running the cell yields the same sample again.
sample_rng = np.random.default_rng(sample_seed)

df_sample = (
    df_good_sentences.groupby(['source', 'quality_attribute'])
    .apply(
        # min(...) lets groups smaller than `sample_size` be taken whole.
        lambda group: group.sample(min(len(group), sample_size), random_state=sample_rng),
        include_groups=False,
    )
    .reset_index()
)
# Raises KeyError for an attribute that has no entry in `quality_attribs`.
df_sample["attribute_desc"] = df_sample["quality_attribute"].apply(lambda x: quality_attribs[x]["desc"])
df_sample

In [None]:
# Column names of the sampled frame.
list(df_sample.columns)

In [None]:
def to_prompt_with_keyword(x):
    """Build the keyword-aware verification prompt for one row.

    ``x`` is indexed by the keys 'quality_attribute', 'attribute_desc',
    'matched_word', 'sentence', 'source' and 'keyword'. The returned text asks
    for a JSON object with the fields "false_positive" and "reasoning".
    """
    return f"""
You are an expert in evaluating and categorizing quality attributes in software engineering. You possess the necessary skills to distinguish sentences that clearly relate to a given quality attribute from those that do not. 

Evaluate whether the matched keyword accurately aligns with its associated quality attribute, given the context provided. Your goal is to determine if the match makes sense in relation to the quality attribute description or if it is a false positive.

Data:

Quality Attribute: {x['quality_attribute']}
Attribute Description: {x['attribute_desc']}
Matched Word: {x['matched_word']}
Sentence: {x['sentence']}
Source: {x['source']}
Keyword: {x['keyword']}

Instructions: 
1. Analyze the sentence, the matched word, and the attribute description.
2. Determine if the matched word accurately reflects the intended quality attribute in this context.
3. If it does, label it as an accurate match; if not, mark it as a false positive.

Output your response as a JSON object in the following format:
{{
  "false_positive": <boolean>,
  "reasoning": "<str>"
}}
"""


In [None]:
def to_prompt(x):
    """Build the verification prompt for one row.

    ``x`` is indexed by the keys 'quality_attribute', 'attribute_desc' and
    'sentence'. The returned text asks for a JSON-only answer with the fields
    "false_positive" and "reasoning".
    """
    return f"""
You are an expert in evaluating and categorizing quality attributes in software engineering. You possess the necessary skills to distinguish sentences that clearly relate to a given quality attribute from those that do not. 

Evaluate whether the content accurately aligns with its associated quality attribute, given the context provided. Your goal is to determine if the content makes sense in relation to the quality attribute description or if it is a false positive.

Data:

Quality Attribute: {x['quality_attribute']}
Attribute Description: {x['attribute_desc']}
Content: {x['sentence']}

Instructions: 
1. Analyze the content and the attribute description.
2. Determine if the content accurately reflects the intended quality attribute in this context.
3. If it does, label it as an accurate match; if not, mark it as a false positive.
4. Output only the JSON object in response, without any additional explanation.
5. Ensure the JSON output is properly formatted. Escape any special characters or inner quotes in strings to ensure compatibility with JSON parsers. Within JSON strings use \\\" to escape double quotes.


Output your response as a JSON object in the following format:
{{
  "false_positive": <boolean>,
  "reasoning": "<str>"
}}
"""


In [None]:
# Render one prompt per sampled row; each row is handed to `to_prompt` as-is.
df_sample["prompt"] = df_sample.apply(to_prompt, axis=1)
df_sample

In [None]:
# Write the sampled rows to disk, leaving out the index column.
df_sample.to_csv(
    "./metadata/keywords/verification/big_sample2.csv",
    index=False,
)

In [None]:
# Show the generated prompts.
df_sample.loc[:, "prompt"]

In [None]:
# Load the labelled sample and list it ordered by the `false_positive` flag.
df_answers = pd.read_csv(
    "./metadata/keywords/verification/sample_with_responses.csv"
)
df_answers.sort_values(by="false_positive")

In [None]:
# Narrow `df_answers` to the rows flagged as false positives, then count them
# per (source, quality attribute).
df_answers = df_answers.loc[df_answers["false_positive"].eq(True)]
df_answers.groupby(by=["source", "quality_attribute"]).size()

In [None]:
# Load the "ailab" responses and list them ordered by the `false_positive` flag.
df_answers = pd.read_csv(
    "./metadata/keywords/verification/sample_with_responses_ailab.csv"
)
df_answers.sort_values(by="false_positive")

In [None]:
# Load the "ailab_2" responses and list them ordered by the `false_positive` flag.
df_answers = pd.read_csv(
    "./metadata/keywords/verification/sample_with_responses_ailab_2.csv"
)
df_answers.sort_values(by="false_positive")

In [None]:
# Load the "ailab_3" responses and list them ordered by the `false_positive` flag.
df_answers = pd.read_csv(
    "./metadata/keywords/verification/sample_with_responses_ailab_3.csv"
)
df_answers.sort_values(by="false_positive")

In [None]:
import plotly.express as px

# Labelled big sample, plus the folder that receives every exported chart and table.
csv = Path("./metadata/keywords/verification/big_sample_with_responses.csv")
analysis_folder = csv.parent / "analysis"
# Neither `fig.write_html` nor `DataFrame.to_csv` creates missing directories,
# so make sure the output folder exists before anything is written into it.
analysis_folder.mkdir(parents=True, exist_ok=True)

df_big_sample = pd.read_csv(csv)

# One row per (source, attribute, verdict, keyword, matched word) combination
# with the number of sampled sentences in it.
df_answers = (
    df_big_sample
    .groupby(["source", "quality_attribute", "false_positive", "keyword", "matched_word"])
    .size()
    .reset_index(name="count")
)

# Sunburst hierarchy, from the centre outwards:
# verdict -> attribute -> source -> keyword -> matched word.
fig = px.sunburst(
    df_answers,
    path=["false_positive", "quality_attribute", "source", "keyword", "matched_word"],
    values="count",
)
fig.update_layout(**proportions)
fig.show()
fig.write_html(analysis_folder / "big_sample_sunburst.html")

In [None]:
import numpy as np

# Rate every (source, keyword) pair by how its sampled matches were judged.
#
#   false_positive_False  matches judged accurate
#   false_positive_True   matches judged to be false positives
#   total                 all sampled rows of the pair
#   score                 (accurate - false) / (accurate + false); NaN when the
#                         pair has no True/False label at all
#   adv_score             same ratio with +1 in the denominator, scaled by
#                         1 + log1p(accurate)
#   total_score           5 * accurate - false - |accurate - false|
#
# The scores are computed column-wise: the former row-wise `apply(axis=1)` was
# slower and raised ZeroDivisionError for a pair without any True/False label.
keyword_rating_with_source = (
    df_big_sample.groupby(["source", "keyword"])
    .agg(
        false_positive_False=("false_positive", lambda x: (x == False).sum()),
        false_positive_True=("false_positive", lambda x: (x == True).sum()),
        total=("keyword", "size"),
    )
    .reset_index()
    .assign(
        score=lambda d: (d["false_positive_False"] - d["false_positive_True"])
        / (d["false_positive_False"] + d["false_positive_True"]),
        adv_score=lambda d: (d["false_positive_False"] - d["false_positive_True"])
        / (d["false_positive_False"] + d["false_positive_True"] + 1)
        * (1 + np.log1p(d["false_positive_False"])),
        total_score=lambda d: d["false_positive_False"] * 5
        - d["false_positive_True"]
        - (d["false_positive_False"] - d["false_positive_True"]).abs(),
    )
    # Best-rated first; ties are broken by the number of accurate matches.
    .sort_values(["total_score", "false_positive_False"], ascending=False)
)
keyword_rating_with_source.to_csv(analysis_folder / "keyword_rating_with_source.csv")
keyword_rating_with_source

In [None]:
# Rate every keyword (all sources pooled) by how its sampled matches were judged.
#
#   false_positive_False  matches judged accurate
#   false_positive_True   matches judged to be false positives
#   total                 all sampled rows of the keyword
#   score                 (accurate - false) / (accurate + false); NaN when the
#                         keyword has no True/False label at all
#   adv_score             same ratio with +1 in the denominator, scaled by
#                         1 + log1p(accurate)
#   total_score           5 * accurate - false - |accurate - false|
#
# The scores are computed column-wise: the former row-wise `apply(axis=1)` was
# slower and raised ZeroDivisionError for a keyword without any True/False label.
keyword_rating = (
    df_big_sample.groupby(["keyword"])
    .agg(
        false_positive_False=("false_positive", lambda x: (x == False).sum()),
        false_positive_True=("false_positive", lambda x: (x == True).sum()),
        total=("keyword", "size"),
    )
    .reset_index()
    .assign(
        score=lambda d: (d["false_positive_False"] - d["false_positive_True"])
        / (d["false_positive_False"] + d["false_positive_True"]),
        adv_score=lambda d: (d["false_positive_False"] - d["false_positive_True"])
        / (d["false_positive_False"] + d["false_positive_True"] + 1)
        * (1 + np.log1p(d["false_positive_False"])),
        total_score=lambda d: d["false_positive_False"] * 5
        - d["false_positive_True"]
        - (d["false_positive_False"] - d["false_positive_True"]).abs(),
    )
    # Best-rated first; ties are broken by the number of accurate matches.
    .sort_values(["total_score", "false_positive_False"], ascending=False)
)
keyword_rating.to_csv(analysis_folder / "keyword_rating.csv")
keyword_rating

In [None]:
# Per keyword and verdict: number of sampled rows and the distinct sources
# they came from (joined into one string for the hover text).
keyword_verdict_counts = (
    df_big_sample.groupby(["keyword", "false_positive"])
    .agg(
        size=("keyword", "size"),
        sources=("source", lambda values: ", ".join(values.unique().tolist())),
    )
    .reset_index()
)

fig = px.bar(
    keyword_verdict_counts,
    y="keyword",
    x="size",
    color="false_positive",
    hover_data="sources",
)
fig.update_yaxes(categoryorder="total ascending")
fig.update_layout(width=1600, height=2200)
fig.show()
fig.write_html(analysis_folder / "keyword_rating_stacked_bar.html")

In [None]:
# Row counts per (keyword, verdict, source), ordered by ascending size and then
# by descending keyword / verdict / source.
keyword_source_counts = (
    df_big_sample.groupby(["keyword", "false_positive", "source"])
    .size()
    .reset_index(name="size")
    .sort_values(
        ["size", "keyword", "false_positive", "source"],
        ascending=[True, False, False, False],
    )
)

# One facet column per source.
fig = px.bar(
    keyword_source_counts,
    y="keyword",
    x="size",
    color="false_positive",
    facet_col="source",
)
fig.update_layout(width=1600, height=2200)
fig.show()
fig.write_html(analysis_folder / "keyword_rating_faceted_stacked_bar.html")