In [1]:
import pandas as pd
from scipy.stats import bernoulli
from sklearn.metrics import (
    balanced_accuracy_score,
    matthews_corrcoef,
    recall_score,
    precision_score,
    confusion_matrix,
    cohen_kappa_score,
)
from statsmodels.stats.inter_rater import fleiss_kappa
import numpy as np
from IPython.display import display
from warnings import filterwarnings
# Suppress every warning for the rest of the session: no category, module or
# message filter is passed, so the "ignore" action applies to all warnings.
filterwarnings("ignore")
from analysis_helper import (
    calculate_metrics_for_classic_prompts,
    calculate_fleiss_kappa_for_group,
    calculate_avg_decision,
    calculate_expected_value
)

In [9]:
# Load the parsed results. The original `Decision` column is preserved as
# `Decision_heatmap`, `Decision` is replaced by `ParsedDecision_U2`, and rows
# without a parsed decision are dropped.
df = (
    pd.read_csv("final_gpt4_data_parsed_2_bibtex.csv")
    .assign(
        Decision_heatmap=lambda frame: frame["Decision"],
        Decision=lambda frame: frame["ParsedDecision_U2"],
    )
    .drop(columns=["ParsedDecision_U2"])
    .dropna(subset=["Decision"])
)

# Labels containing "MAYBE" are reduced to their second underscore-separated
# token; every other label is left untouched.
df["Decision"] = df["Decision"].apply(
    lambda label: label.split("_")[1] if "MAYBE" in label else label
)
print(df.shape)

# Key the frame by article and order it by that key.
df = df.set_index("Article").sort_index()
display(df.head())

# Split by prompt variant (the SIMPLE variant is currently not used).
selection = df.loc[df["Prompt"].str.contains("SH0-RQy-SELECTION")]
cot = df.loc[df["Prompt"].str.contains("SH0-RQy-COT")]
print(selection.shape, cot.shape)

# Restrict SELECTION to the articles present in COT, in COT's order, so both
# frames can be compared row by row.
index = cot.index
selection = selection.loc[index]
print(selection.shape, cot.shape)

(3766, 8)


Unnamed: 0_level_0,Project,Prompt,Dataset,Confidence,Decision,is_true,Decision_heatmap
Article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
003157a6-50b3-4882-9782-daf11e3b9d2b,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-COT-BIBTEX...,Std-U2-Cy-SH0-RQy-COT,SMELLREPRODBIBTEX,9.0,EXCLUDE,True,TN
003157a6-50b3-4882-9782-daf11e3b9d2b,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-SELECTION-...,Std-U2-Cy-SH0-RQy-SELECTION,SMELLREPRODBIBTEX,10.0,EXCLUDE,True,TN
004eb63b-8ab7-4e6a-8b62-85eddb290b18,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-COT-BIBTEX...,Std-U2-Cy-SH0-RQy-COT,SMELLREPRODBIBTEX,9.0,EXCLUDE,True,TN
004eb63b-8ab7-4e6a-8b62-85eddb290b18,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-SELECTION-...,Std-U2-Cy-SH0-RQy-SELECTION,SMELLREPRODBIBTEX,8.0,EXCLUDE,True,TN
00542c47-0bdf-4373-b838-a010deb3d31c,Std-U2-Cy-Rn-EXn-INn-A-SH0-EXPn-RQy-SELECTION-...,Std-U2-Cy-SH0-RQy-SELECTION,SMELLREPRODBIBTEX,8.0,EXCLUDE,True,TN


(1884, 7) (1882, 7)
(1882, 7) (1882, 7)


In [10]:
# Sanity check: element-wise comparison of the COT and SELECTION indexes.
# Prints the distinct comparison outcomes and the number of matching positions.
# (The equivalent checks against the SIMPLE prompt are disabled.)
index_matches = cot.index == selection.index
print(np.unique(index_matches), index_matches.sum())

[ True] 1882


In [11]:
def _prefixed_decision_columns(prompt_df, prefix):
    """Return Decision / is_true / Confidence of `prompt_df`, renamed to `<prefix>_<column>`."""
    shared_columns = ["Decision", "is_true", "Confidence"]
    return prompt_df[shared_columns].rename(
        columns={column: f"{prefix}_{column}" for column in shared_columns}
    )


# Start from an empty frame whose index is the article keys of `selection`.
final_df = pd.DataFrame(index=list(selection.index))

# Left-merge each prompt's columns on the article index.
# (The SIMPLE prompt is currently not merged in.)
final_df = final_df.merge(
    _prefixed_decision_columns(selection, "SELECTION"),
    how="left",
    left_index=True,
    right_index=True,
)
final_df = final_df.merge(
    _prefixed_decision_columns(cot, "COT"),
    how="left",
    left_index=True,
    right_index=True,
)

# Look up each article's dataset from `df` via an article -> dataset mapping.
article_to_dataset = df["Dataset"].to_dict()
final_df["Dataset"] = final_df.index.map(article_to_dataset)

# Order rows by article key and preview the result.
final_df = final_df.sort_index()
display(final_df.head())


Unnamed: 0,SELECTION_Decision,SELECTION_is_true,SELECTION_Confidence,COT_Decision,COT_is_true,COT_Confidence,Dataset
003157a6-50b3-4882-9782-daf11e3b9d2b,EXCLUDE,True,10.0,EXCLUDE,True,9.0,SMELLREPRODBIBTEX
004eb63b-8ab7-4e6a-8b62-85eddb290b18,EXCLUDE,True,8.0,EXCLUDE,True,9.0,SMELLREPRODBIBTEX
00542c47-0bdf-4373-b838-a010deb3d31c,EXCLUDE,True,8.0,INCLUDE,False,5.0,SMELLREPRODBIBTEX
005a0d35-8d5d-4069-8b61-844117f1cd30,EXCLUDE,True,9.0,EXCLUDE,True,9.0,SMELLREPRODBIBTEX
005f90fd-98d0-444a-9d68-aa082ef8bf1b,EXCLUDE,True,10.0,EXCLUDE,True,10.0,SMELLREPRODBIBTEX


In [12]:
# Move the kernel's working directory up one level.
# NOTE(review): not idempotent, since re-running this cell climbs another level.
# Presumably needed so the `utils` package imported in a later cell resolves; confirm.
%cd ..


/Users/gauransh/Code/PromptSLR


In [13]:
# Print the current working directory via the shell.
!pwd

/Users/gauransh/Code/PromptSLR


In [14]:
from utils.db_connector import DBConnector

db = DBConnector()

# Every article starts without a screened decision.
final_df["screened_decision"] = None

# Fetch the article records whose keys are the index values of final_df.
articles = db.get_articles_by_articlekey(final_df.index.tolist())

print(len(articles))

# Copy each record's decision into final_df: the last character of
# `ScreenedDecision` is dropped and the remainder is upper-cased.
# Records whose key is not in the index are skipped.
known_keys = final_df.index
for article in articles:
    if article.Key not in known_keys:
        continue
    screened_label = article.ScreenedDecision[:-1].upper()
    final_df.loc[article.Key, "screened_decision"] = screened_label

display(final_df.head())

1882


Unnamed: 0,SELECTION_Decision,SELECTION_is_true,SELECTION_Confidence,COT_Decision,COT_is_true,COT_Confidence,Dataset,screened_decision
003157a6-50b3-4882-9782-daf11e3b9d2b,EXCLUDE,True,10.0,EXCLUDE,True,9.0,SMELLREPRODBIBTEX,EXCLUDE
004eb63b-8ab7-4e6a-8b62-85eddb290b18,EXCLUDE,True,8.0,EXCLUDE,True,9.0,SMELLREPRODBIBTEX,EXCLUDE
00542c47-0bdf-4373-b838-a010deb3d31c,EXCLUDE,True,8.0,INCLUDE,False,5.0,SMELLREPRODBIBTEX,EXCLUDE
005a0d35-8d5d-4069-8b61-844117f1cd30,EXCLUDE,True,9.0,EXCLUDE,True,9.0,SMELLREPRODBIBTEX,EXCLUDE
005f90fd-98d0-444a-9d68-aa082ef8bf1b,EXCLUDE,True,10.0,EXCLUDE,True,10.0,SMELLREPRODBIBTEX,EXCLUDE


In [15]:
def _agreement_for_dataset(dataset):
    """Return one result row with Cohen's and Fleiss' kappa for SELECTION vs COT on `dataset`.

    Either kappa is None when fewer than two decisions are available for it.
    (The SIMPLE prompt is currently excluded from both statistics.)
    """
    selection_decisions = selection.loc[selection["Dataset"] == dataset, "Decision"]
    cot_decisions = cot.loc[cot["Dataset"] == dataset, "Decision"]

    # Cohen's kappa needs more than one decision on both sides.
    if len(selection_decisions) > 1 and len(cot_decisions) > 1:
        cohen = cohen_kappa_score(selection_decisions, cot_decisions)
    else:
        cohen = None

    # Pair the decisions by position (indexes are reset) and drop incomplete pairs.
    paired = pd.concat(
        [
            selection_decisions.reset_index(drop=True),
            cot_decisions.reset_index(drop=True),
        ],
        axis=1,
    )
    paired.columns = ["SELECTION", "COT"]
    paired = paired.dropna()

    # Fleiss' kappa is only computed when more than one complete pair remains.
    fleiss = calculate_fleiss_kappa_for_group(paired) if len(paired) > 1 else None

    return {
        "Dataset": dataset,
        "Cohen_Selection_COT": cohen,
        "Fleiss_Kappa": fleiss,
    }


# One result row per dataset present in df.
datasets = df["Dataset"].unique()
results = [_agreement_for_dataset(dataset) for dataset in datasets]

# Collect the rows into a table and show it.
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Dataset,Cohen_Selection_COT,Fleiss_Kappa
0,SMELLREPRODBIBTEX,0.58594,0.583487
1,TESTNNBIBTEX,0.481928,0.443366


### Check the expected value of the Bernoulli random variable

In [13]:
def _decision_from_expected_value(expected_value):
    """Return "INCLUDE" when the expected value is above 0.5, otherwise "EXCLUDE"."""
    if expected_value > 0.5:
        return "INCLUDE"
    return "EXCLUDE"


# Row-wise expected value from the project helper, then the decision it implies.
final_df["Expected_Value"] = final_df.apply(calculate_expected_value, axis=1)
final_df["Expected_Decision"] = final_df["Expected_Value"].apply(
    _decision_from_expected_value
)
# A derived decision counts as correct when it equals the screened decision.
final_df["Expected_is_true"] = (
    final_df["Expected_Decision"] == final_df["screened_decision"]
)

# Row-wise decision from the project helper `calculate_avg_decision`,
# scored against the screened decision in the same way.
final_df["Avg_Decision"] = final_df.apply(calculate_avg_decision, axis=1)
final_df["Avg_is_true"] = final_df["Avg_Decision"] == final_df["screened_decision"]

# Preview the enriched frame.
final_df.head()

Unnamed: 0,SELECTION_Decision,SELECTION_is_true,SELECTION_Confidence,COT_Decision,COT_is_true,COT_Confidence,Dataset,screened_decision,Expected_Value,Expected_Decision,Expected_is_true,Avg_Decision,Avg_is_true
0017065d-90ca-4d08-b08a-a35c07f87812,EXCLUDE,True,8.0,EXCLUDE,True,10.0,RL4SE,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
001d34e1-3344-457d-8f91-810d0293109c,EXCLUDE,True,8.0,EXCLUDE,True,8.0,ESPLE,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
00212ea4-fc6f-4c5c-8269-345fcf9b74df,EXCLUDE,True,8.0,EXCLUDE,True,9.0,SMELLREPROD,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
002b6bdb-7206-4994-9560-ff9829f29ffa,EXCLUDE,True,8.0,EXCLUDE,True,9.0,RL4SE,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True
003006a3-56cd-45bc-84aa-d1fc9cbfb817,EXCLUDE,True,10.0,EXCLUDE,True,8.0,LC,EXCLUDE,0.0,EXCLUDE,True,EXCLUDE,True


In [18]:
# One metrics table per prompt; each dataset becomes a column.
# (metric_df and simple_metric_df are created but not filled in this cell.)
metric_df = pd.DataFrame()
select_metric_df = pd.DataFrame()
simple_metric_df = pd.DataFrame()
cot_metric_df = pd.DataFrame()

# Confidence threshold handed to the metrics helper (example value).
confidence_threshold = 8

# (column prefix, display title, target table) for every evaluated prompt.
prompt_tables = (
    ("SELECTION", "Selection", select_metric_df),
    ("COT", "COT", cot_metric_df),
)

for d in final_df.Dataset.unique():
    # Rows belonging to the current dataset.
    dataset_df = final_df.loc[final_df["Dataset"] == d]

    # Same helper call for each prompt; only the column names differ.
    for prefix, _title, table in prompt_tables:
        table[d] = calculate_metrics_for_classic_prompts(
            df=dataset_df,
            confidence_threshold=confidence_threshold,
            prediction_col_name="screened_decision",
            confidence_col_name=f"{prefix}_Confidence",
            true_label_col_name=f"{prefix}_Decision",
            is_true_label=f"{prefix}_is_true",
        )

# Row labels for the values returned by the helper, in order.
metric_names = [
    "Balanced Accuracy",
    "Recall",
    "Specificity",
    "NPV",
    "Precision",
    "MCC",
    "WSP (Work Saved Percentage)",
    "Accuracy in WSP",
]
for _prefix, _title, table in prompt_tables:
    table.index = metric_names

# Show each table with datasets as rows, sorted by dataset name.
for _prefix, title, table in prompt_tables:
    print(f"{title} Metrics (Confidence Threshold = {confidence_threshold})")
    display(table.T.sort_index())

Selection Metrics (Confidence Threshold = 8)


Unnamed: 0,Balanced Accuracy,Recall,Specificity,NPV,Precision,MCC,WSP (Work Saved Percentage),Accuracy in WSP
SMELLREPRODBIBTEX,0.834274,0.745098,0.923451,0.99152,0.231707,0.386314,0.992398,0.918091
TESTNNBIBTEX,0.696318,0.823529,0.569106,0.958904,0.208955,0.256724,0.813953,0.6


COT Metrics (Confidence Threshold = 8)


Unnamed: 0,Balanced Accuracy,Recall,Specificity,NPV,Precision,MCC,WSP (Work Saved Percentage),Accuracy in WSP
SMELLREPRODBIBTEX,0.943505,0.945946,0.941065,0.998655,0.273438,0.491273,0.944444,0.941176
TESTNNBIBTEX,0.545009,0.695652,0.394366,0.8,0.271186,0.080051,0.546512,0.468085


In [17]:
# Metrics for the two derived decisions stored in final_df:
#   * Expected_Decision / Expected_is_true
#   * Avg_Decision / Avg_is_true
# This cell only reads those columns, so fail early with an actionable message
# when they are absent (e.g. final_df was rebuilt after they were last added).
required_columns = [
    "Expected_Decision",
    "Expected_is_true",
    "Avg_Decision",
    "Avg_is_true",
]
missing_columns = [
    column for column in required_columns if column not in final_df.columns
]
if missing_columns:
    raise KeyError(
        f"final_df is missing {missing_columns}; run the cell that adds the "
        "Expected_* and Avg_* columns before computing these metrics."
    )

# Initialize DataFrames to store metrics (one column per dataset)
metric_df = pd.DataFrame()
bernoulli_metric_df = pd.DataFrame()
avg_metric_df = pd.DataFrame()
# Confidence threshold passed to the metrics helper (example value)
confidence_threshold = 8

# Calculate metrics for each dataset
for d in final_df.Dataset.unique():
    # Filter data for the current dataset
    dataset_df = final_df[final_df["Dataset"] == d]

    # Metrics for the expected-value decision (no confidence column is supplied)
    bernoulli_metric_df[d] = calculate_metrics_for_classic_prompts(
        df=dataset_df,
        confidence_threshold=confidence_threshold,
        prediction_col_name="screened_decision",
        confidence_col_name="",
        true_label_col_name="Expected_Decision",
        is_true_label="Expected_is_true",
        is_bernoulli_distributed=True,
        is_bernoulli_confidence=True
    )

    # Metrics for the averaged decision (no confidence column is supplied)
    avg_metric_df[d] = calculate_metrics_for_classic_prompts(
        df=dataset_df,
        confidence_threshold=confidence_threshold,
        prediction_col_name="screened_decision",
        confidence_col_name="",
        true_label_col_name="Avg_Decision",
        is_true_label="Avg_is_true",
        is_bernoulli_distributed=True,
    )

# Row labels for the values returned by the helper, in order
metric_names = [
    "Balanced Accuracy",
    "Recall",
    "Specificity",
    "NPV",
    "Precision",
    "MCC",
    "WSP (Work Saved Percentage)",
    "Accuracy in WSP",
]
bernoulli_metric_df.index = metric_names
avg_metric_df.index = metric_names

# Transpose so datasets are rows, sorted by dataset name, and display
print(f"Bernoulli Metrics (Confidence Threshold = {confidence_threshold})")
display(bernoulli_metric_df.T.sort_index())
print(f"Average Metrics (Confidence Threshold = {confidence_threshold})")
display(avg_metric_df.T.sort_index())

KeyError: 'Expected_Decision'

In [33]:
# Copy the COT metrics to the clipboard: datasets as rows sorted by name,
# metric columns in the order listed below, without index or header row.
(
    cot_metric_df.T.sort_index()
    .loc[
        :,
        [
            "WSP (Work Saved Percentage)",
            "Balanced Accuracy",
            "Accuracy in WSP",
            "Recall",
            "Specificity",
            "NPV",
            "Precision",
            "MCC",
        ],
    ]
    .to_clipboard(index=False, header=None)
)

In [20]:
# Print the dataset names covered by the selection metrics, one per line,
# in sorted order.
# The loop variable is deliberately not `_`: in IPython `_` holds the previous
# cell's output, and assigning to it shadows that for the rest of the session.
for dataset_name in select_metric_df.T.sort_index().index:
    print(f"{dataset_name}")

BEHAVE
DTCPS
ESM2
ESPLE
GAMESEFINAL
LC
MPM4CPS
RL4SE
SECSELFADAPT
SMELLREPROD
TESTNN
TRUSTSE
UPDATECOLLABMDE


In [None]:
BEHAVE	0.8681506849315068	0.9289940828402367	0.8134328358208955	0.9705093833780161	0.9354005167958657	0.9083333333333333	0.81328874754513
DTCPS	0.8354114713216958	0.746268656716418	0.6818181818181818	0.7507987220447284	0.9710743801652892	0.16129032258064516	0.23929732109382376
ESM2	0.8157894736842105	0.4838709677419355	0.75	0.47191011235955055	0.9767441860465116	0.06	0.09029898368277256
ESPLE	0.7538940809968847	0.8842975206611571	0.32608695652173914	0.9652996845425867	0.9080118694362018	0.5769230769230769	0.37590366472586995
GAMESEFINAL	0.8205546492659054	0.731610337972167	0.7132352941176471	0.7384196185286104	0.8741935483870967	0.5025906735751295	0.41252447788808533
LC	0.7033171822586657	0.895601483836778	0.34256055363321797	0.9956195244055068	0.8933183604716451	0.9339622641509434	0.5289327236944147
MPM4CPS	0.6585365853658537	0.7925925925925926	0.9111111111111111	0.7333333333333333	0.9428571428571428	0.6307692307692307	0.6080052052987127
RL4SE	0.921028466483012	0.9501495513459621	0.8113207547169812	0.9578947368421052	0.9891304347826086	0.5180722891566265	0.6246184376201414
SECSELFADAPT	0.6741102581995813	0.9616977225672878	0.3448275862068966	0.9807897545357525	0.9797441364605544	0.35714285714285715	0.3312042375754402
SMELLREPROD	0.9532983070636311	0.9283527250459277	0.8888888888888888	0.9287925696594427	0.9986684420772304	0.12213740458015267	0.31429397209614923
TESTNN	0.56	0.5	0.7	0.44871794871794873	0.8536585365853658	0.24561403508771928	0.1215056098424133
TRUSTSE	0.2660377358490566	0.5602836879432624	0.40860215053763443	0.8541666666666666	0.4270833333333333	0.8444444444444444	0.26711239769950584
UPDATECOLLABMDE	0.8982857142857142	0.9402035623409669	0.4482758620689655	0.9793956043956044	0.9570469798657718	0.6341463414634146	0.502828514204309
