In [1]:
# Install the krippendorff module if not already installed
%pip install krippendorff

import pandas as pd
import krippendorff
import numpy as np


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
def summarise_categorical(df, column):
    """
    Summarise a categorical column in a DataFrame.
    """
    summary = pd.DataFrame({
        'Count': df[column].value_counts(),
        'Proportion': df[column].value_counts(normalize=True)
        })
    
    summary["Proportion"] = summary["Proportion"].apply(lambda x: f"{x:.2%}")
    display(summary)

In [3]:
# load the annotations
df = pd.read_csv("./final_GK_CH_annotations.csv")

# summarise the 'final_label' column
summarise_categorical(df, "final_label")

Unnamed: 0_level_0,Count,Proportion
final_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0 - clear no,939,93.90%
1 - clear yes,51,5.10%
borderline,9,0.90%
1- clear yes,1,0.10%


In [4]:
def krippendorff_alpha(data, level_of_measurement='nominal', category_order=None):
    """Calculate Krippendorff's alpha for the given data."""
    # Create a pivot table with ratings as values
    pivot_table = data[["annot1_label", "annot2_label"]]
    
    # Convert string category names to numeric values
    if category_order is not None:
        # For ordinal data with specified order
        # Check that all categories in the data are in the category_order
        missing_categories = set(pivot_table.values.flatten()) - set(category_order)
        if missing_categories:
            print(f"Warning: The following categories are not in the specified order: {missing_categories}")

        # Create a mapping from category to numeric value
        category_map = {cat: i for i, cat in enumerate(category_order)}
        
        # Apply the mapping to convert categories to ordered numeric values
        # Any values not in category_order will become NaN
        pivot_table = pivot_table.apply(lambda x: x.map(category_map))
    else:
        # For nominal data without specified order
        pivot_table = pivot_table.apply(lambda x: pd.factorize(x)[0])
        # replace any -1 values with NaN
        pivot_table = pivot_table.replace(-1, np.nan)

    # Convert to a numpy array
    ratings_array = pivot_table.to_numpy().T

    # Calculate Krippendorff's alpha
    alpha = krippendorff.alpha(reliability_data=ratings_array, level_of_measurement=level_of_measurement)
    return alpha

# summarise the 'disagreement' column, after filling NaN values with False
df["disagreement"] = df["disagreement"].fillna(False)
summarise_categorical(df, "disagreement")

# Calculate Krippendorff's alpha for the 'annot1_label' and 'annot2_label' columns, using ordinal measurement
alpha = krippendorff_alpha(df, level_of_measurement="ordinal", category_order=["0 - clear no", "borderline", "1 - clear yes"])
print(f"Krippendorff's alpha: {alpha:.4f}")


Unnamed: 0_level_0,Count,Proportion
disagreement,Unnamed: 1_level_1,Unnamed: 2_level_1
False,986,98.60%
True,14,1.40%


Krippendorff's alpha: 0.9036


In [5]:
# load the prompt templates
templates = pd.read_csv("./relevance_templates.csv")

def create_prompt(template, user_prompt):
    
    # remove linebreaks and tabs
    user_prompt = user_prompt.replace("\n", " ").replace("\t", " ")

    # collapse multiple spaces into one
    user_prompt = " ".join(user_prompt.split())

    return template.format(user_prompt = user_prompt)

df_dict = {}

for i, row in templates.iterrows():
    df_dict[row["id"]] = df.copy()
    df_dict[row["id"]]["eval_prompt"] = df_dict[row["id"]]["user_prompt"].apply(lambda x: create_prompt(row["prompt_template"], x))

# save the eval prompt dfs to csv files
for i in df_dict:
    df_dict[i].to_csv("./eval_prompts/relevance_160424_prompts_{}.csv".format(i), index=False)


In [None]:
all_clean_df = pd.read_csv("/Users/greg/Desktop/newFolderLLM/issuebench/1_dataset_construction/1_preprocessing/clean/all_clean.csv")
print(all_clean_df.columns)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/greg/Desktop/newFolderLLM/issuebench/1_dataset_construction/clean/all_clean.csv'

In [44]:
import os
print(os.getcwd())

/Users/greg/Desktop/newFolderLLM/issuebench/1_dataset_construction/2_relevance_filtering


In [None]:
# apply best prompt template (see notebook 2_) to all clean samples

# all_clean_df = pd.read_csv("../data/clean/all_clean.csv")
all_clean_df = pd.read_csv("/Users/greg/Desktop/newFolderLLM/issuebench/1_dataset_construction/2_relevance_filtering/final_GK_CH_annotations.csv")
all_clean_df["eval_prompt"] = all_clean_df["user_prompt"].apply(lambda x: create_prompt(templates.iloc[4]["prompt_template"], x))
all_clean_df.to_csv("../data/clean/all_clean_prompts.csv", index=False)