In [5]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from sec_certs.dataset import CCDataset
from pathlib import Path

REPO_ROOT = Path("./../../../").resolve()
REF_ANNOTATIONS_PATH = REPO_ROOT / "dataset/reference_annotations"

def get_agreement_df(adam_df_path, jano_df_path):
    """Join two annotators' label files and flag the rows on which they agree.

    Both CSV files must provide the columns ``dgst``, ``referenced_cert_id`` and
    ``label``. Labels from the second file are upper-cased before the comparison;
    labels from the first file are compared as-is.

    Args:
        adam_df_path: Location of the first annotator's CSV (anything
            ``pd.read_csv`` accepts).
        jano_df_path: Location of the second annotator's CSV (anything
            ``pd.read_csv`` accepts).

    Returns:
        The inner join of both files on ``(dgst, referenced_cert_id)``. Columns
        that exist in both files get the suffixes ``_adam`` / ``_jano``, and a
        boolean ``agreement`` column tells whether the two labels are equal.
    """
    df_adam = pd.read_csv(adam_df_path)
    df_jano = pd.read_csv(jano_df_path)
    # Vectorised upper-casing: unlike `x.upper()` on each value, it passes a missing
    # label (read from an empty CSV cell as NaN) through instead of raising.
    df_jano["label"] = df_jano["label"].str.upper()
    df = pd.merge(df_adam, df_jano, on=["dgst", "referenced_cert_id"], suffixes=("_adam", "_jano"))
    # NaN never compares equal, so a row with a missing label counts as a disagreement.
    df["agreement"] = df["label_adam"] == df["label_jano"]
    return df

In [10]:
# Load both annotators' labels for each split; every resulting frame has an `agreement` column.
agreement_train = get_agreement_df(REF_ANNOTATIONS_PATH / "adam_train.csv", REF_ANNOTATIONS_PATH / "jano_train.csv")
agreement_valid = get_agreement_df(REF_ANNOTATIONS_PATH / "adam_valid.csv", REF_ANNOTATIONS_PATH / "jano_valid.csv")
# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it the row labels
# of the two splits repeat, which makes label-based lookups on the result ambiguous.
agreement_all = pd.concat([agreement_train, agreement_valid], ignore_index=True)
# Rows where the annotators disagree; copied so that columns can be added to it later
# without touching `agreement_all`.
conflicting_df = agreement_all.loc[~agreement_all["agreement"]].copy()

In [7]:
# Inter-annotator agreement over the train and valid splits, expressed as Cohen's kappa.
# Interpretation guide: https://hrcak.srce.hr/file/132393
# Target: a kappa of 0.9 or higher.
cohen_kappa_score(agreement_all["label_adam"], agreement_all["label_jano"])

0.7617784008411355

In [11]:
# Load the CC dataset and attach links to the PDFs of every conflicting instance.
dset = CCDataset.from_json(REPO_ROOT / "dataset/cc_final_run_may_23/dataset.json")


def _pdf_uri(pdf_path):
    """Return a well-formed ``file://`` URI for a local path.

    ``Path.as_uri()`` emits ``file:///...`` for absolute paths and percent-encodes
    characters such as spaces. Prefixing the string with ``"file:/"`` would instead turn
    ``/home/user/a.pdf`` into ``file://home/user/a.pdf``, where ``home`` is parsed as a host.
    """
    # NOTE(review): assumes the path attributes are always set; a None here raises TypeError.
    return Path(pdf_path).resolve().as_uri()


conflicting_df["report_path"] = conflicting_df["dgst"].map(lambda dgst: _pdf_uri(dset[dgst].state.report_pdf_path))
conflicting_df["st_path"] = conflicting_df["dgst"].map(lambda dgst: _pdf_uri(dset[dgst].state.st_pdf_path))

# Export only the columns needed to resolve the conflicts; the row index is not written.
conflicting_df.loc[:, ["referenced_cert_id", "label_adam", "label_jano", "comment", "report_path", "st_path"]].to_csv(REF_ANNOTATIONS_PATH / "conflicting.csv", index=False)