In [1]:
import pandas as pd
from scipy.stats import bernoulli
from sklearn.metrics import (
    balanced_accuracy_score,
    matthews_corrcoef,
    recall_score,
    precision_score,
    confusion_matrix,
    cohen_kappa_score,
)
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np
from IPython.display import display
from warnings import filterwarnings
filterwarnings("ignore")
from analysis_helper import (
    calculate_metrics_for_classic_prompts,
    calculate_fleiss_kappa_for_group,
    calculate_avg_decision,
    calculate_expected_value
)

In [2]:
# Load the parsed GPT-4 screening results (one row per article/prompt pair).
df = pd.read_csv("final_gpt4_data_parsed_2.csv")

# Keep the original `Decision` values under `Decision_heatmap`, then promote
# the parsed decision (`ParsedDecision_U2`) to be the working `Decision`.
df = df.assign(
    Decision_heatmap=df["Decision"],
    Decision=df["ParsedDecision_U2"],
).drop(columns=["ParsedDecision_U2"])

# Any label containing "MAYBE" is replaced by its second underscore-separated
# token; all other labels are kept unchanged.
# (Dropping the MAYBE rows entirely is the disabled alternative.)
df["Decision"] = [
    label.split("_")[1] if "MAYBE" in label else label for label in df["Decision"]
]
print(df.shape)

# Key the frame by article and order it by that key.
df = df.set_index("Article").sort_index()
display(df.head())

# Split the rows by prompt family via a match on the `Prompt` column.
# (The "SH0-RQy-SIMPLE" family is currently not analysed.)
selection = df[df["Prompt"].str.contains("SH0-RQy-SELECTION")]
cot = df[df["Prompt"].str.contains("SH0-RQy-COT")]
print(selection.shape, cot.shape)

(20578, 8)


Unnamed: 0_level_0,Project,Prompt,Dataset,Confidence,Decision,is_true,Decision_heatmap
Article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
001d34e1-3344-457d-8f91-810d0293109c,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-SELECTION-...,Std-U2-Cy-SH0-RQy-SELECTION,ESPLE,8.0,EXCLUDE,True,TN
001d34e1-3344-457d-8f91-810d0293109c,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-COT-ESPLE-...,Std-U2-Cy-SH0-RQy-COT,ESPLE,8.0,EXCLUDE,True,TN
00212ea4-fc6f-4c5c-8269-345fcf9b74df,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-COT-SMELLR...,Std-U2-Cy-SH0-RQy-COT,SMELLREPROD,9.0,EXCLUDE,True,TN
00212ea4-fc6f-4c5c-8269-345fcf9b74df,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-SELECTION-...,Std-U2-Cy-SH0-RQy-SELECTION,SMELLREPROD,8.0,EXCLUDE,True,TN
003006a3-56cd-45bc-84aa-d1fc9cbfb817,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-COT-LC-FIN...,Std-U2-Cy-SH0-RQy-COT,LC,8.0,EXCLUDE,True,TN


(10289, 7) (10289, 7)


In [3]:
# Element-wise comparison of the COT and SELECTION article indexes: prints the
# distinct comparison outcomes and the number of matching positions.
# (The equivalent checks against the SIMPLE prompt are currently disabled.)
index_matches = cot.index == selection.index
print(np.unique(index_matches), index_matches.sum())

[ True] 10289


In [4]:
# Start from an empty frame whose index is the SELECTION article keys.
final_df = pd.DataFrame(index=list(selection.index))

# For each prompt family, left-join its decision, correctness flag and
# confidence onto the article index as "<PROMPT>_<column>" columns.
# (The SIMPLE prompt family is currently not merged.)
for prompt_label, prompt_rows in (("SELECTION", selection), ("COT", cot)):
    final_df = final_df.merge(
        prompt_rows[["Decision", "is_true", "Confidence"]].add_prefix(
            f"{prompt_label}_"
        ),
        how="left",
        left_index=True,
        right_index=True,
    )

# Look up the dataset of every article from the full results frame.
article_to_dataset = df["Dataset"].to_dict()
final_df["Dataset"] = final_df.index.map(article_to_dataset)

# Order by article key and preview the result.
final_df = final_df.sort_index()
display(final_df.head())


Unnamed: 0,SELECTION_Decision,SELECTION_is_true,SELECTION_Confidence,COT_Decision,COT_is_true,COT_Confidence,Dataset
001d34e1-3344-457d-8f91-810d0293109c,EXCLUDE,True,8.0,EXCLUDE,True,8.0,ESPLE
00212ea4-fc6f-4c5c-8269-345fcf9b74df,EXCLUDE,True,8.0,EXCLUDE,True,9.0,SMELLREPROD
003006a3-56cd-45bc-84aa-d1fc9cbfb817,EXCLUDE,True,10.0,EXCLUDE,True,8.0,LC
0032cdd7-a488-478e-8010-0ffe777e5137,EXCLUDE,False,8.0,EXCLUDE,True,8.0,SMELLREPROD
00348ad9-edf3-4f1d-b4e5-105c1e51cdd2,EXCLUDE,True,8.0,EXCLUDE,True,10.0,SMELLREPROD


In [5]:
# Move the kernel's working directory up one level (presumably so the `utils`
# package imported in the next cell resolves - TODO confirm).
# NOTE: not idempotent - re-running this cell moves up another level.
%cd ..


/Users/gauransh/Code/PromptSLR


In [6]:
# Print the current working directory to confirm the directory change.
!pwd

/Users/gauransh/Code/PromptSLR


In [7]:
from utils.db_connector import DBConnector

db = DBConnector()

# Fetch the article records for every key in final_df's index.
articles = db.get_articles_by_articlekey(final_df.index.tolist())
print(len(articles))

# Build a key -> label lookup for the fetched articles that appear in the
# index. The stored decision loses its final character and is upper-cased
# (presumably turning e.g. "Excluded" into "EXCLUDE" - verify against the DB).
known_keys = final_df.index
screened_by_key = {
    article.Key: article.ScreenedDecision[:-1].upper()
    for article in articles
    if article.Key in known_keys
}

# Rows without a fetched article keep None, as an object-dtype column.
final_df["screened_decision"] = np.array(
    [screened_by_key.get(key) for key in final_df.index], dtype=object
)

display(final_df.head())

10289


Unnamed: 0,SELECTION_Decision,SELECTION_is_true,SELECTION_Confidence,COT_Decision,COT_is_true,COT_Confidence,Dataset,screened_decision
001d34e1-3344-457d-8f91-810d0293109c,EXCLUDE,True,8.0,EXCLUDE,True,8.0,ESPLE,EXCLUDE
00212ea4-fc6f-4c5c-8269-345fcf9b74df,EXCLUDE,True,8.0,EXCLUDE,True,9.0,SMELLREPROD,EXCLUDE
003006a3-56cd-45bc-84aa-d1fc9cbfb817,EXCLUDE,True,10.0,EXCLUDE,True,8.0,LC,EXCLUDE
0032cdd7-a488-478e-8010-0ffe777e5137,EXCLUDE,False,8.0,EXCLUDE,True,8.0,SMELLREPROD,EXCLUDE
00348ad9-edf3-4f1d-b4e5-105c1e51cdd2,EXCLUDE,True,8.0,EXCLUDE,True,10.0,SMELLREPROD,EXCLUDE


In [8]:
# Collect one record per dataset with the SELECTION-vs-COT agreement scores.
# (Comparisons involving the SIMPLE prompt are currently disabled.)
results = []

datasets = df["Dataset"].unique()

for dataset in datasets:
    # Decisions of each prompt family restricted to the current dataset.
    selection_subset = selection.loc[selection["Dataset"] == dataset, "Decision"]
    cot_subset = cot.loc[cot["Dataset"] == dataset, "Decision"]

    # Cohen's kappa is only computed when both sides have more than one rating.
    # NOTE: the two series are handed over as-is, so rows are paired by
    # position; this relies on both subsets listing articles in the same order.
    has_enough_ratings = len(selection_subset) > 1 and len(cot_subset) > 1
    cohen = (
        cohen_kappa_score(selection_subset, cot_subset) if has_enough_ratings else None
    )

    # Side-by-side table of both raters (article index discarded, rows paired
    # by position), without rows that have a missing decision.
    combined = pd.concat(
        [
            selection_subset.reset_index(drop=True),
            cot_subset.reset_index(drop=True),
        ],
        axis=1,
        keys=["SELECTION", "COT"],
    ).dropna()

    # Fleiss' kappa needs more than one complete row.
    fleiss = calculate_fleiss_kappa_for_group(combined) if len(combined) > 1 else None

    results.append(
        {
            "Dataset": dataset,
            "Cohen_Selection_COT": cohen,
            "Fleiss_Kappa": fleiss,
        }
    )

# One row per dataset.
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Dataset,Cohen_Selection_COT,Fleiss_Kappa
0,ESPLE,0.325606,0.316762
1,SMELLREPROD,0.428741,0.424988
2,LC,0.801675,0.800245
3,GAMESEFINAL,0.801061,0.799692
4,UPDATECOLLABMDE,0.845644,0.845558
5,ESM2,0.51537,0.514655
6,DTCPS,0.731182,0.729315
7,SECSELFADAPT,0.3226,0.31429
8,TRUSTSE,0.198646,0.044839
9,BEHAVE,0.843241,0.842374


### Check the Expected Value of the Bernoulli Random Variable

In [9]:
# Row-wise expected value, computed by the project helper.
final_df["Expected_Value"] = final_df.apply(calculate_expected_value, axis=1)

# Threshold the expected value: strictly above 0.5 means INCLUDE, anything
# else means EXCLUDE.
final_df["Expected_Decision"] = [
    "INCLUDE" if expected > 0.5 else "EXCLUDE"
    for expected in final_df["Expected_Value"]
]
# Flag whether the thresholded decision matches the screened decision.
final_df["Expected_is_true"] = final_df["Expected_Decision"].eq(
    final_df["screened_decision"]
)

# Row-wise combined decision from the project helper, and its match flag.
final_df["Avg_Decision"] = final_df.apply(calculate_avg_decision, axis=1)
final_df["Avg_is_true"] = final_df["Avg_Decision"].eq(final_df["screened_decision"])

# Preview the enriched frame.
final_df.head()

Unnamed: 0,SELECTION_Decision,SELECTION_is_true,SELECTION_Confidence,COT_Decision,COT_is_true,COT_Confidence,Dataset,screened_decision,Expected_Value,Expected_Decision,Expected_is_true,Avg_Decision,Avg_is_true
001d34e1-3344-457d-8f91-810d0293109c,EXCLUDE,True,8.0,EXCLUDE,True,8.0,ESPLE,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
00212ea4-fc6f-4c5c-8269-345fcf9b74df,EXCLUDE,True,8.0,EXCLUDE,True,9.0,SMELLREPROD,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
003006a3-56cd-45bc-84aa-d1fc9cbfb817,EXCLUDE,True,10.0,EXCLUDE,True,8.0,LC,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
0032cdd7-a488-478e-8010-0ffe777e5137,EXCLUDE,False,8.0,EXCLUDE,True,8.0,SMELLREPROD,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
00348ad9-edf3-4f1d-b4e5-105c1e51cdd2,EXCLUDE,True,8.0,EXCLUDE,True,10.0,SMELLREPROD,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True


In [10]:
# Result frames: one column per dataset, one row per metric.
metric_df = pd.DataFrame()  # not populated in this cell
select_metric_df = pd.DataFrame()
simple_metric_df = pd.DataFrame()  # SIMPLE prompt is disabled; stays empty
cot_metric_df = pd.DataFrame()

# Confidence threshold forwarded to the metrics helper (example value).
confidence_threshold = 8

# Column prefix in final_df -> frame that collects that prompt's metrics.
frames_by_prompt = {"SELECTION": select_metric_df, "COT": cot_metric_df}

for d in final_df.Dataset.unique():
    # Rows belonging to the current dataset.
    dataset_df = final_df[final_df["Dataset"] == d]

    # NOTE(review): the screened decision is passed as `prediction_col_name`
    # and the prompt's decision as `true_label_col_name` - confirm that this
    # matches the helper's intended meaning of those parameters.
    for prompt_label, frame in frames_by_prompt.items():
        frame[d] = calculate_metrics_for_classic_prompts(
            df=dataset_df,
            confidence_threshold=confidence_threshold,
            prediction_col_name="screened_decision",
            confidence_col_name=f"{prompt_label}_Confidence",
            true_label_col_name=f"{prompt_label}_Decision",
            is_true_label=f"{prompt_label}_is_true",
        )

# Row labels for the metric frames
# (assumes the helper returns the eight metrics in this order - TODO confirm).
metric_names = [
    "Balanced Accuracy",
    "Recall",
    "Specificity",
    "NPV",
    "Precision",
    "MCC",
    "WSP (Work Saved Percentage)",
    "Accuracy in WSP",
]
for frame in frames_by_prompt.values():
    frame.index = metric_names

# Show one row per dataset, ordered alphabetically by dataset name.
print(f"Selection Metrics (Confidence Threshold = {confidence_threshold})")
display(select_metric_df.T.sort_index())
print(f"COT Metrics (Confidence Threshold = {confidence_threshold})")
display(cot_metric_df.T.sort_index())

Selection Metrics (Confidence Threshold = 8)


Unnamed: 0,Balanced Accuracy,Recall,Specificity,NPV,Precision,MCC,WSP (Work Saved Percentage),Accuracy in WSP
BEHAVE,0.855184,0.74026,0.970109,0.899244,0.912,0.759133,0.893836,0.902299
DTCPS,0.699286,0.678571,0.72,0.965517,0.162393,0.225791,0.942643,0.716931
ESM2,0.604455,0.714286,0.494624,0.958333,0.096154,0.106691,0.877193,0.51
ESPLE,0.591762,0.206897,0.976628,0.784182,0.75,0.313106,0.832814,0.781796
GAMESEFINAL,0.714198,0.68254,0.745856,0.818182,0.58371,0.414933,0.898858,0.724138
LC,0.657512,0.321767,0.993257,0.899061,0.886957,0.497609,0.83675,0.898441
MPM4CPS,0.753252,0.843137,0.663366,0.893333,0.558442,0.478357,0.741463,0.723684
SECSELFADAPT,0.573596,0.171429,0.975763,0.949345,0.307692,0.194509,0.826239,0.928209
SMELLREPROD,0.837789,0.756757,0.918821,0.994144,0.171779,0.334805,0.992411,0.915294
TESTNN,0.62384,0.727273,0.520408,0.894737,0.253968,0.191915,0.685714,0.558333


COT Metrics (Confidence Threshold = 8)


Unnamed: 0,Balanced Accuracy,Recall,Specificity,NPV,Precision,MCC,WSP (Work Saved Percentage),Accuracy in WSP
BEHAVE,0.89882,0.834586,0.963054,0.946731,0.880952,0.812523,0.922945,0.931354
DTCPS,0.716308,0.681818,0.750799,0.971074,0.16129,0.239297,0.835411,0.746269
ESM2,0.612113,0.75,0.474227,0.978723,0.055556,0.087671,0.885965,0.485149
ESPLE,0.603898,0.234899,0.972896,0.856784,0.648148,0.323917,0.882658,0.843529
GAMESEFINAL,0.716669,0.713235,0.720102,0.878882,0.468599,0.388042,0.862969,0.718336
LC,0.653336,0.311047,0.995625,0.870492,0.938596,0.498121,0.724562,0.874486
MPM4CPS,0.814532,0.90566,0.723404,0.931507,0.648649,0.604115,0.717073,0.789116
SECSELFADAPT,0.577528,0.174603,0.980453,0.901608,0.536585,0.260662,0.766225,0.887978
SMELLREPROD,0.865521,0.794872,0.93617,0.994681,0.233083,0.40805,0.955633,0.932804
TESTNN,0.561728,0.666667,0.45679,0.804348,0.290323,0.10811,0.617143,0.509259


In [11]:
# Result frames: one column per dataset, one row per metric.
metric_df = pd.DataFrame()  # not populated in this cell
bernoulli_metric_df = pd.DataFrame()
avg_metric_df = pd.DataFrame()

# Confidence threshold forwarded to the metrics helper (example value).
confidence_threshold = 8

for d in final_df.Dataset.unique():
    # Rows belonging to the current dataset.
    dataset_df = final_df[final_df["Dataset"] == d]

    # Arguments common to both helper calls; no confidence column is named.
    shared_kwargs = dict(
        df=dataset_df,
        confidence_threshold=confidence_threshold,
        prediction_col_name="screened_decision",
        confidence_col_name="",
        is_bernoulli_distributed=True,
    )

    # Metrics for the expected-value decision (Bernoulli confidence enabled).
    bernoulli_metric_df[d] = calculate_metrics_for_classic_prompts(
        true_label_col_name="Expected_Decision",
        is_true_label="Expected_is_true",
        is_bernoulli_confidence=True,
        **shared_kwargs,
    )

    # Metrics for the averaged decision (Bernoulli confidence flag not passed).
    avg_metric_df[d] = calculate_metrics_for_classic_prompts(
        true_label_col_name="Avg_Decision",
        is_true_label="Avg_is_true",
        **shared_kwargs,
    )

# Row labels for the metric frames
# (assumes the helper returns the eight metrics in this order - TODO confirm).
metric_names = [
    "Balanced Accuracy",
    "Recall",
    "Specificity",
    "NPV",
    "Precision",
    "MCC",
    "WSP (Work Saved Percentage)",
    "Accuracy in WSP",
]
bernoulli_metric_df.index = metric_names
avg_metric_df.index = metric_names

# Show one row per dataset, ordered alphabetically by dataset name.
print(f"Bernoulli Metrics (Confidence Threshold = {confidence_threshold})")
display(bernoulli_metric_df.T.sort_index())
print(f"Average Metrics (Confidence Threshold = {confidence_threshold})")
display(avg_metric_df.T.sort_index())

Bernoulli Metrics (Confidence Threshold = 8)


Unnamed: 0,Balanced Accuracy,Recall,Specificity,NPV,Precision,MCC,WSP (Work Saved Percentage),Accuracy in WSP
BEHAVE,0.891971,0.813433,0.970509,0.935401,0.908333,0.813289,0.868151,0.928994
DTCPS,0.716308,0.681818,0.750799,0.971074,0.16129,0.239297,0.835411,0.746269
ESM2,0.610955,0.75,0.47191,0.976744,0.06,0.090299,0.815789,0.483871
ESPLE,0.645693,0.326087,0.9653,0.908012,0.576923,0.375904,0.753894,0.884298
GAMESEFINAL,0.725827,0.713235,0.73842,0.874194,0.502591,0.412524,0.820555,0.73161
LC,0.66909,0.342561,0.99562,0.893318,0.933962,0.528933,0.703317,0.895601
MPM4CPS,0.822222,0.911111,0.733333,0.942857,0.630769,0.608005,0.658537,0.792593
SECSELFADAPT,0.662809,0.344828,0.98079,0.979744,0.357143,0.331204,0.67411,0.961698
SMELLREPROD,0.908841,0.888889,0.928793,0.998668,0.122137,0.314294,0.953298,0.928353
TESTNN,0.574359,0.7,0.448718,0.853659,0.245614,0.121506,0.56,0.5


Average Metrics (Confidence Threshold = 8)


Unnamed: 0,Balanced Accuracy,Recall,Specificity,NPV,Precision,MCC,WSP (Work Saved Percentage),Accuracy in WSP
BEHAVE,0.841817,0.72067,0.962963,0.886364,0.895833,0.731256,1.0,0.888699
DTCPS,0.720459,0.723404,0.717514,0.951311,0.253731,0.300677,1.0,0.718204
ESM2,0.571429,0.666667,0.47619,0.943396,0.098361,0.077235,1.0,0.491228
ESPLE,0.581024,0.188679,0.973369,0.809524,0.666667,0.277787,1.0,0.800623
GAMESEFINAL,0.680498,0.653266,0.707729,0.809392,0.517928,0.343746,1.0,0.690049
LC,0.587356,0.182432,0.99228,0.761153,0.9,0.33987,1.0,0.768915
MPM4CPS,0.719697,0.772727,0.666667,0.795918,0.635514,0.435395,1.0,0.712195
SECSELFADAPT,0.563977,0.154206,0.973749,0.86769,0.507692,0.219162,1.0,0.851361
SMELLREPROD,0.838503,0.754717,0.922289,0.99158,0.236686,0.393113,1.0,0.917104
TESTNN,0.523708,0.514706,0.53271,0.633333,0.411765,0.046243,1.0,0.525714


In [12]:
# Copy the per-dataset Bernoulli metrics (selected columns, in this order) to
# the system clipboard without index or header.
(
    bernoulli_metric_df.T.sort_index()
    .loc[
        :,
        [
            "WSP (Work Saved Percentage)",
            "Accuracy in WSP",
            "Recall",
            "Specificity",
            "NPV",
            "Precision",
            "MCC",
        ],
    ]
    .to_clipboard(index=False, header=None)
)