In [8]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import openpyxl
from openpyxl.styles import PatternFill

In [9]:
def calculate_kappa_and_highlight_disagreements(df: pd.DataFrame, column1: str, column2: str, output_file: str):
    """
    Calculate the Cohen's Kappa score for the agreement between two label columns,
    highlight the disagreement rows, and save the results to an Excel file.

    The exported sheet ('Kappa Results') contains every column of ``df`` plus a
    boolean 'Disagreement' column. Each row whose two labels differ is filled
    red across all exported columns. The DataFrame passed in is left untouched;
    the extra column only exists in the exported copy.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - column1 (str): The name of the first label column (e.g., 'Label_Patrick').
    - column2 (str): The name of the second label column (e.g., 'Label_Leuson').
    - output_file (str): The file path to save the resulting DataFrame with highlighted rows.

    Returns:
    - kappa_score (float): The Cohen's Kappa score.

    Raises:
    - KeyError: If column1 or column2 is not a column of df.
    """
    sheet_name = 'Kappa Results'

    # Calculate Cohen's Kappa score
    kappa_score = cohen_kappa_score(df[column1], df[column2])

    # Element-wise comparison of the two label columns. Note that two missing
    # values compare as unequal under `!=`, so such rows are flagged as well.
    disagreement = df[column1] != df[column2]

    # Build the exported frame on a copy so the caller's DataFrame is not mutated.
    result = df.assign(Disagreement=disagreement)

    # Save to Excel
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        result.to_excel(writer, index=False, sheet_name=sheet_name)

        # Access the openpyxl sheet that pandas just wrote
        sheet = writer.book[sheet_name]

        # Define the red fill for disagreement
        red_fill = PatternFill(start_color='FF0000', end_color='FF0000', fill_type='solid')

        n_columns = result.shape[1]

        # Walk the mask by *position*, not by index label: the sheet rows follow
        # the order in which rows were written, whatever the DataFrame index is
        # (filtered, sorted or string-labelled frames included).
        for position, disagrees in enumerate(disagreement.to_numpy()):
            if not disagrees:
                continue
            # +2: openpyxl rows are 1-based and row 1 holds the header
            excel_row = position + 2
            for excel_col in range(1, n_columns + 1):
                sheet.cell(row=excel_row, column=excel_col).fill = red_fill

    # Return the Kappa score
    return kappa_score

In [1]:
# Read the labelled RQ2 dataset from disk
df = pd.read_excel('RQ2_Dataset.xlsx')

# Score inter-rater agreement and export the sheet with disagreements highlighted
kappa = calculate_kappa_and_highlight_disagreements(
    df,
    column1='Label_Patrick',
    column2='Label_Leuson',
    output_file='disagreement_highlighted.xlsx',
)

# Report the resulting agreement score
print(f"Cohen's Kappa score: {kappa}")


NameError: name 'pd' is not defined

In [14]:
# Read the second-round labelling spreadsheet from disk
df = pd.read_excel('Second_round_labelling.xlsx')

# Score inter-rater agreement and export the sheet with disagreements highlighted
kappa = calculate_kappa_and_highlight_disagreements(
    df,
    column1='Label_Patrick',
    column2='Label_Leuson',
    output_file='second_disagreement_highlighted.xlsx',
)

# Report the resulting agreement score
print(f"Cohen's Kappa score: {kappa}")

Cohen's Kappa score: 0.8092378344961583
