In [None]:
import os

import pandas as pd

In [None]:
# Load each annotator's copy of the gold set.
df_1 = pd.read_csv("gold-v2-annotator-1.csv")
df_2 = pd.read_csv("gold-v2-annotator-2.csv")

# Columns whose contents are not identical across the two files.
diff_cols = [col for col in df_1.columns if not df_1[col].equals(df_2[col])]

# Columns that match exactly in both files; these are taken once, from df_1.
identical_cols = [col for col in df_1.columns if col not in diff_cols]

# The differing columns are the ones carried over from both frames.
merge_cols = diff_cols

# Index-aligned left merge. Only the differing columns exist on both sides,
# so only they receive the "_annotator_1" / "_annotator_2" suffixes; the
# identical columns keep their original names.
df = df_1[identical_cols + merge_cols].merge(
    df_2[merge_cols],
    how="left",
    left_index=True,
    right_index=True,
    suffixes=("_annotator_1", "_annotator_2"),
)

print(diff_cols)

In [None]:
# Rows on which the two annotators' gold_answer labels disagree.
# NaN != NaN evaluates to True, so rows where both labels are missing are
# excluded explicitly by requiring at least one label to be present.
answer_1 = df["gold_answer_annotator_1"]
answer_2 = df["gold_answer_annotator_2"]
has_any_label = answer_1.notna() | answer_2.notna()
df[has_any_label & answer_1.ne(answer_2)]

In [None]:
# Combine the two annotators' values into one column per differing field.
# The per-annotator "<col>_annotator_1" / "<col>_annotator_2" columns are kept;
# the combined result is stored under the original column name.
df = df.copy()  # rebind to a copy of the merged frame before adding columns

for col in diff_cols:
    values_1 = df[f"{col}_annotator_1"]
    values_2 = df[f"{col}_annotator_2"]

    if col == "gold_answer":
        # Consensus only: "yes" if both say yes, "no" if both say no, and
        # missing (pd.NA) for any disagreement or absent label.
        mask_both_yes = values_1.eq("yes") & values_2.eq("yes")
        mask_both_no = values_1.eq("no") & values_2.eq("no")

        # Start with every row missing, then fill in the agreed labels.
        df[col] = pd.NA
        df.loc[mask_both_yes, col] = "yes"
        df.loc[mask_both_no, col] = "no"
    else:
        # Every other differing column is treated as a boolean flag:
        # True if either annotator set it, with missing values counted as False.
        #
        # Cast through the nullable "boolean" dtype before filling. Calling
        # fillna(False) directly on an object column (True/False/NaN) relies on
        # pandas silently downcasting object to bool, which is deprecated. The
        # explicit cast also raises on non-boolean data instead of letting `|`
        # perform a bitwise OR on it.
        flag_1 = values_1.astype("boolean").fillna(False).astype(bool)
        flag_2 = values_2.astype("boolean").fillna(False).astype(bool)
        df[col] = flag_1 | flag_2

In [None]:
# Display the merged frame: per-annotator columns plus the combined columns.
df

In [None]:
# Tally the combined gold_answer labels. dropna=False keeps the rows left
# missing (annotators disagreed, or a label was absent) in the counts.
df["gold_answer"].value_counts(dropna=False)

In [None]:
# Tally gold_stateful values, with missing values included in the counts.
# NOTE(review): if gold_stateful differed between annotators, this is the
# OR-combined column; otherwise it is df_1's column unchanged. Confirm
# against the printed diff_cols.
df["gold_stateful"].value_counts(dropna=False)

In [None]:
# Export is disabled; uncomment to write the combined frame to gold-v2.csv.
# df.to_csv("gold-v2.csv", index=False)