In [1]:
import zipfile
import pandas as pd
from irrCAC.table import CAC
import warnings
from IPython.display import Markdown, display
import numpy as np

warnings.simplefilter("ignore")

In [2]:
zip_path = "./final annotations-20250313T022456Z-001.zip"
# The archive handle is kept open on purpose: a later cell calls
# zf.infolist() again to derive the spreadsheet names.
zf = zipfile.ZipFile(zip_path)


# One DataFrame per .xlsx member, in the order the archive lists them.
list_ = []
for file in zf.infolist():
    if file.filename.endswith(".xlsx"):
        # Close each member stream as soon as pandas has parsed it instead of
        # leaving the open handle to the garbage collector.
        with zf.open(file.filename) as member:
            # The first sheet row is skipped and the first column becomes the index.
            df = pd.read_excel(member, skiprows=1, index_col=0)
        list_.append(df)

In [3]:
# Short display name for every .xlsx member of the archive: the
# "final annotations/" folder prefix and the ".xlsx" suffix are removed.
filenames = []
for member_info in zf.infolist():
    member_name = member_info.filename
    if not member_name.endswith(".xlsx"):
        continue
    short_name = member_name.replace("final annotations/", "")
    filenames.append(short_name.replace(".xlsx", ""))

In [4]:
def progress_bar(percentage, bar_length=20):
    """Render a percentage as a fixed-width text bar such as ``[#####-----]``.

    Parameters
    ----------
    percentage : float
        Completion on a 0-100 scale. Values below 0 or above 100 are clamped
        so the bar never grows past ``bar_length`` characters.
    bar_length : int, optional
        Number of characters between the brackets (default 20).

    Returns
    -------
    str
        ``"["`` + filled ``#`` characters + remaining ``-`` characters + ``"]"``.
    """
    # Without clamping, 150 would yield 30 '#' and -10 would yield 22 '-'.
    clamped = min(max(percentage, 0), 100)
    filled_length = int(bar_length * (clamped / 100))
    return "[" + "#" * filled_length + "-" * (bar_length - filled_length) + "]"

In [5]:
# Per-spreadsheet count of filled cells; reused afterwards for the overall total.
completed = []

# Markdown rows are collected in a list and joined once at the end.
table_parts = [
    "| i | spreadsheet | nans | progress | progress_bar |\n|---|---|---|---|---|\n"
]
for row_no, (sheet_name, sheet) in enumerate(zip(filenames, list_)):
    # Empty cells in the columns from position 2 up to (not including) the last one.
    missing = sheet.iloc[:, 2:-1].isna().sum().sum()
    # NOTE(review): 840 is used as the number of cells to fill per spreadsheet — confirm.
    answered = 840 - missing
    completed.append(answered)
    share = (answered / 840) * 100  # percentage of cells filled
    table_parts.append(
        f"| {row_no} | {sheet_name} | {missing} | {share:.1f}% | <progress id='file' max='100' value={share}></progress> |\n"
    )

markdown_table = "".join(table_parts)

In [6]:
# Overall completion across every loaded spreadsheet. The spreadsheet count is
# taken from `completed` rather than hard-coded, so the figure stays correct
# when files are added to or removed from the archive.
total_cells = 840 * len(completed)
# Guard against an archive that contained no spreadsheets at all.
pgrs = sum(completed) / total_cells * 100 if total_cells else 0.0

display(
    Markdown(
        f"""<label for="file">Annotation progress {pgrs:.2f}%:</label>

<progress id="file" max="100" value="{pgrs}"></progress>"""
    )
)

# Per-spreadsheet breakdown.
display(Markdown(markdown_table))

<label for="file">Annotation progress 15.39%:</label>

<progress id="file" max="100" value="15.39047619047619"></progress>

| i | spreadsheet | nans | progress | progress_bar |
|---|---|---|---|---|
| 0 | AI_debunkings20 | 800 | 4.8% | <progress id='file' max='100' value=4.761904761904762></progress> |
| 1 | AI_debunkings | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 2 | AI_debunkings8 | 805 | 4.2% | <progress id='file' max='100' value=4.166666666666666></progress> |
| 3 | AI_debunkings16 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 4 | AI_debunkings9 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 5 | AI_debunkings21 | 527 | 37.3% | <progress id='file' max='100' value=37.26190476190476></progress> |
| 6 | AI_debunkings4 | 714 | 15.0% | <progress id='file' max='100' value=15.0></progress> |
| 7 | AI_debunkings7 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 8 | AI_debunkings19 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 9 | AI_debunkings14 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 10 | AI_debunkings22 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 11 | AI_debunkings6 | 805 | 4.2% | <progress id='file' max='100' value=4.166666666666666></progress> |
| 12 | AI_debunkings5 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 13 | AI_debunkings0 | 770 | 8.3% | <progress id='file' max='100' value=8.333333333333332></progress> |
| 14 | AI_debunkings1 | 512 | 39.0% | <progress id='file' max='100' value=39.04761904761905></progress> |
| 15 | AI_debunkings18 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 16 | AI_debunkings3 | 0 | 100.0% | <progress id='file' max='100' value=100.0></progress> |
| 17 | AI_debunkings23 | 2 | 99.8% | <progress id='file' max='100' value=99.76190476190476></progress> |
| 18 | AI_debunkings10 | 427 | 49.2% | <progress id='file' max='100' value=49.166666666666664></progress> |
| 19 | AI_debunkings13 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 20 | AI_debunkings17 | 793 | 5.6% | <progress id='file' max='100' value=5.595238095238096></progress> |
| 21 | AI_debunkings12 | 728 | 13.3% | <progress id='file' max='100' value=13.333333333333334></progress> |
| 22 | AI_debunkings11 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 23 | AI_debunkings2 | 840 | 0.0% | <progress id='file' max='100' value=0.0></progress> |
| 24 | AI_debunkings24 | 805 | 4.2% | <progress id='file' max='100' value=4.166666666666666></progress> |


In [7]:
# Both tables use their first column as the index.
# NOTE(review): the Excel location is an absolute path on one machine; the
# notebook will not run elsewhere until it is made relative or configurable.
ai_annotation_path = "/Users/francisco/Library/CloudStorage/OneDrive-Personal/Francisco/GitHub/eval-agent/app/automated_eval3.xlsx"

model_map = pd.read_csv("ModelMap.csv", index_col=0)
ai_annotation = pd.read_excel(ai_annotation_path, index_col=0)


def human_ai_interrater_agreement_table(
    human_annotation, column, ai_annotation=ai_annotation, model_map=model_map
):
    """Cross-tabulate AI labels against one human annotator's labels.

    Parameters
    ----------
    human_annotation : pandas.DataFrame
        One annotator's sheet; it is placed column-wise next to ``model_map``.
    column : str
        Key of the option table below (for example ``"fact_1_relevance"``).
    ai_annotation : pandas.DataFrame
        Automated annotations; must contain ``column``.
    model_map : pandas.DataFrame
        Table providing the ``"Model"`` column used to discard unmapped rows.

    Returns
    -------
    pandas.DataFrame
        Float counts with AI labels on the rows and human labels on the
        columns, both in the order of the option list for ``column``. Label
        pairs that never occur are 0; values outside the option list
        (including missing answers, which become the text ``"nan"``) are not
        counted.
    """

    def _normalise(value):
        # Compare labels case-insensitively and ignore surrounding whitespace.
        return str(value).lower().strip()

    # Allowed answers per column, in rating order; free-text columns have none.
    # The key order also provides the column names applied after the concat.
    options_map = {
        "Model": [],
        "claim": [],
        "rebuttal": [],
        "fact_1_relevance": ["completely irrelevant", "mostly irrelevant", "mostly relevant", "completely relevant"],
        "fact_1_accuracy": ["completely inaccurate", "mostly inaccurate", "mostly accurate", "completely accurate"],
        "familiarity": ["completely unfamiliar", "mostly unfamiliar", "mostly familiar", "completely familiar"],
        "fact_2_relevance": ["completely irrelevant", "mostly irrelevant", "mostly relevant", "completely relevant"],
        "fact_2_accuracy": ["completely inaccurate", "mostly inaccurate", "mostly accurate", "completely accurate"],
        "fallacy_correctness": ["completely incorrect", "mostly incorrect", "mostly correct", "completely correct"],
        "fallacy_clarity": ["completely unclear", "mostly unclear", "mostly clear", "completely clear"],
        "comments": [],
    }

    # Attach the model key to the human sheet and discard rows without one,
    # then rename all columns positionally to the keys above.
    merged = pd.concat([model_map, human_annotation], axis=1)
    merged = merged.dropna(subset=["Model"])
    merged.columns = list(options_map)

    # Restrict both raters to the rows they share and normalise the text.
    shared_rows = merged.index.intersection(ai_annotation.index)
    human_labels = merged.loc[shared_rows, column].apply(_normalise)
    ai_labels = ai_annotation.loc[shared_rows, column].apply(_normalise)

    # Observed label pairs; only combinations that actually occur are present.
    observed = pd.crosstab(ai_labels, human_labels)

    # Full square grid over the allowed answers. Wrapping the label list in
    # another list makes both axes single-level MultiIndexes, hence the [0]
    # when reading a key back.
    labels = options_map[column]
    matrix = pd.DataFrame(columns=[labels], index=[labels])
    for row_key in matrix.index:
        for col_key in matrix.columns:
            try:
                count = observed.loc[row_key[0], col_key[0]]
            except KeyError:
                # Combination never observed.
                count = 0
            matrix.loc[row_key[0], col_key[0]] = count

    return matrix.astype(float)

In [16]:
# Markdown table of AI-vs-human agreement on "fact_1_relevance", one row per annotator.
markdown_table = "| i | percent | cohen | gwet AC1 |\n" "|---|------|-------|------|\n"

for i, ann in enumerate(list_):
    cm = human_ai_interrater_agreement_table(ann, "fact_1_relevance")
    table = CAC(cm)
    percent = table.pa2()["est"]["coefficient_value"]
    cohen = table.cohen()["est"]["coefficient_value"]
    gwet = table.gwet()["est"]["coefficient_value"]
    # Keep the row when at least one of the two coefficients is defined.
    if not np.isnan(cohen) or not np.isnan(gwet):
        # All three scores use fixed two-decimal formatting; a bare ".2" would
        # mean two significant digits and print values like "0.096" or "-0.0078".
        markdown_table += f"| {i} | {percent:.2f} | {cohen:.2f} | {gwet:.2f} |\n"

In [17]:
# Section heading (the two trailing spaces force a Markdown line break),
# followed by the AI-vs-human score table.
display(
    Markdown(
        "## Inter-rater agreement table  \n"
        "### Ai-Human agreement table: Fact1 relevance"
    ),
    Markdown(markdown_table),
)

## Inter-rater agreement table  
### Ai-Human agreement table: Fact1 relevance

| i | percent | cohen | gwet AC1 |
|---|------|-------|------|
| 0 | 0.00 |0.00 | -0.2 |
| 5 | 0.31 |0.03 | 0.16 |
| 6 | 0.33 |-0.00 | 0.18 |
| 13 | 0.33 |0.05 | 0.16 |
| 14 | 0.31 |0.00 | 0.17 |
| 16 | 0.34 |0.08 | 0.18 |
| 17 | 0.27 |-0.05 | 0.096 |
| 18 | 0.21 |-0.04 | -0.0078 |
| 20 | 0.00 |-0.57 | -0.26 |
| 21 | 0.30 |0.00 | 0.15 |


In [10]:
def human_human_interrater_agreement_table(
    human_annotation_a, human_annotation_b, column, model_map=model_map
):
    """Cross-tabulate two human annotators' labels for one rated column.

    Parameters
    ----------
    human_annotation_a, human_annotation_b : pandas.DataFrame
        The two annotators' sheets; each is placed column-wise next to
        ``model_map``.
    column : str
        Key of the option table below (for example ``"fact_1_relevance"``).
    model_map : pandas.DataFrame
        Table providing the ``"Model"`` column used to discard unmapped rows.

    Returns
    -------
    pandas.DataFrame
        Float counts with annotator A's labels on the rows and annotator B's
        on the columns, both in the order of the option list for ``column``.
        Label pairs that never occur are 0; values outside the option list
        (including missing answers, which become the text ``"nan"``) are not
        counted.
    """

    def _normalise(value):
        # Compare labels case-insensitively and ignore surrounding whitespace.
        return str(value).lower().strip()

    def _with_model_key(sheet, names):
        # Attach the model key, discard rows without one, rename positionally.
        keyed = pd.concat([model_map, sheet], axis=1).dropna(subset=["Model"])
        keyed.columns = names
        return keyed

    # Allowed answers per column, in rating order; free-text columns have none.
    # The key order also provides the column names applied after the concat.
    options_map = {
        "Model": [],
        "claim": [],
        "rebuttal": [],
        "fact_1_relevance": ["completely irrelevant", "mostly irrelevant", "mostly relevant", "completely relevant"],
        "fact_1_accuracy": ["completely inaccurate", "mostly inaccurate", "mostly accurate", "completely accurate"],
        "familiarity": ["completely unfamiliar", "mostly unfamiliar", "mostly familiar", "completely familiar"],
        "fact_2_relevance": ["completely irrelevant", "mostly irrelevant", "mostly relevant", "completely relevant"],
        "fact_2_accuracy": ["completely inaccurate", "mostly inaccurate", "mostly accurate", "completely accurate"],
        "fallacy_correctness": ["completely incorrect", "mostly incorrect", "mostly correct", "completely correct"],
        "fallacy_clarity": ["completely unclear", "mostly unclear", "mostly clear", "completely clear"],
        "comments": [],
    }

    keyed_a = _with_model_key(human_annotation_a, list(options_map))
    keyed_b = _with_model_key(human_annotation_b, list(options_map))

    # Restrict both annotators to the rows they share and normalise the text.
    shared_rows = keyed_a.index.intersection(keyed_b.index)
    labels_a = keyed_a.loc[shared_rows, column].apply(_normalise)
    labels_b = keyed_b.loc[shared_rows, column].apply(_normalise)

    # Observed label pairs; only combinations that actually occur are present.
    observed = pd.crosstab(labels_a, labels_b)

    # Full square grid over the allowed answers. Wrapping the label list in
    # another list makes both axes single-level MultiIndexes, hence the [0]
    # when reading a key back.
    labels = options_map[column]
    matrix = pd.DataFrame(columns=[labels], index=[labels])
    for row_key in matrix.index:
        for col_key in matrix.columns:
            try:
                count = observed.loc[row_key[0], col_key[0]]
            except KeyError:
                # Combination never observed.
                count = 0
            matrix.loc[row_key[0], col_key[0]] = count

    return matrix.astype(float)

In [11]:
# Square annotator-by-annotator grid labelled Ann0, Ann1, ...; every cell
# starts empty and is filled with the pair's Gwet coefficient below.
annotator_labels = [f"Ann{i}" for i in range(len(list_))]
init = pd.DataFrame(columns=annotator_labels, index=annotator_labels)


# Visit every ordered pair of distinct annotators; the diagonal stays empty.
for row_pos, rater_a in enumerate(list_):
    for col_pos, rater_b in enumerate(list_):
        if row_pos != col_pos:
            pair_counts = human_human_interrater_agreement_table(
                rater_a, rater_b, "fact_1_relevance", model_map=model_map
            )
            gwet_estimate = CAC(pair_counts).gwet()["est"]
            init.iat[row_pos, col_pos] = gwet_estimate["coefficient_value"]

In [12]:
# Section heading; the two trailing spaces force a Markdown line break.
display(
    Markdown(
        "## Inter-rater agreement table  \n"
        "### Gwet's AC1 scores: Fact1 relevance"
    )
)


# Hide annotators without any score (all-empty columns, then all-empty rows)
# and blank out the remaining gaps before rendering.
scored_columns = init.dropna(axis=1, how="all")
scored_pairs = scored_columns.dropna(axis=0, how="all")
display(Markdown(scored_pairs.fillna("").to_markdown()))

## Inter-rater agreement table  
### Gwet's AC1 scores: Fact1 relevance

|       | Ann0   | Ann5    | Ann6    | Ann13   | Ann14   | Ann16   | Ann17   | Ann18   | Ann20   | Ann21   |
|:------|:-------|:--------|:--------|:--------|:--------|:--------|:--------|:--------|:--------|:--------|
| Ann0  |        | 1.0     | 1.0     | 1.0     | 1.0     | 1.0     | 1.0     | 1.0     | -0.2    | 1.0     |
| Ann5  | 1.0    |         | 0.76211 | 0.68889 | 0.94342 | 0.72226 | 0.75215 | 0.65926 | 0.20442 | 1.0     |
| Ann6  | 1.0    | 0.76211 |         | 0.6152  | 0.80758 | 0.52818 | 0.63265 | 0.42246 | 0.17714 | 0.78723 |
| Ann13 | 1.0    | 0.68889 | 0.6152  |         | 0.73034 | 0.47952 | 0.35867 | 0.33498 | 0.17714 | 0.63265 |
| Ann14 | 1.0    | 0.94342 | 0.80758 | 0.73034 |         | 0.82422 | 0.79791 | 0.65949 | 0.20442 | 1.0     |
| Ann16 | 1.0    | 0.72226 | 0.52818 | 0.47952 | 0.82422 |         | 0.71994 | 0.58884 | 0.17714 | 0.67093 |
| Ann17 | 1.0    | 0.75215 | 0.63265 | 0.35867 | 0.79791 | 0.71994 |         | 0.54618 | 0.18644 | 0.78723 |
| Ann18 | 1.0    | 0.65926 | 0.42246 | 0.33498 | 0.65949 | 0.58884 | 0.54618 |         | 0.16279 | 0.67093 |
| Ann20 | -0.2   | 0.20442 | 0.17714 | 0.17714 | 0.20442 | 0.17714 | 0.18644 | 0.16279 |         | 0.20442 |
| Ann21 | 1.0    | 1.0     | 0.78723 | 0.63265 | 1.0     | 0.67093 | 0.78723 | 0.67093 | 0.20442 |         |