In [32]:
import pandas as pd
import json
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import wilcoxon

# Load the annotated summaries. The JSON is read as a mapping: each value
# becomes one DataFrame row and the corresponding key becomes its index label.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read
# does not depend on the platform's locale default.
with open("results/evaluation/bart-test-extrinsic-100-summaries.json", "r", encoding="utf-8") as f:
    json_sums = json.load(f)
df = pd.DataFrame(list(json_sums.values()), index=list(json_sums))


def get_baseline_diff(df, model, metric):
    """Cross-tabulate the baseline's outcomes against a model's outcomes.

    Args:
        df: DataFrame holding one column per ``<system>_<metric>`` pair.
        model: Prefix of the model column to compare with ``baseline-bart``.
        metric: Metric suffix shared by the baseline and model columns.

    Returns:
        A 2x2 DataFrame of pair counts. Rows are the baseline outcome and
        columns are the model outcome, both ordered ``[False, True]``.
        Combinations that never occur are reported as 0 instead of being
        left out, so the table always has the square shape that a paired
        test such as McNemar's needs.

    Raises:
        KeyError: If either ``baseline-bart_<metric>`` or ``<model>_<metric>``
            is not a column of ``df``.
    """
    baseline = df[f"baseline-bart_{metric}"]
    candidate = df[f"{model}_{metric}"]

    # Rows with a missing value on either side are left out of the counts
    # (crosstab ignores them as well); doing it up front keeps the bool cast
    # below from turning NaN into True.
    # NOTE(review): assumes the columns hold True/False (or 0/1) flags - confirm.
    both_present = baseline.notna() & candidate.notna()
    table = pd.crosstab(
        index=baseline[both_present].astype(bool),
        columns=candidate[both_present].astype(bool),
    )

    # crosstab only emits labels that actually occur, so a system that is
    # always (or never) correct would yield a 2x1 / 1x2 / 1x1 table. Force
    # both outcomes onto both axes and fill the missing cells with 0.
    outcomes = [False, True]
    return table.reindex(index=outcomes, columns=outcomes, fill_value=0)

# For every system and metric, report the system's mean score, its difference
# from the BART baseline, and two paired-test p-values against the baseline.
for model in ("corrector", "pinocchio", "meng-rl", "fbs_classifier", "fbs_oracle"):
    for metric in ("is_factual", "has_extrinsic_and_fully_factual"):
        baseline_scores = df[f"baseline-bart_{metric}"]
        model_scores = df[f"{model}_{metric}"]

        # Column means and their difference.
        baseline_mean = baseline_scores.mean()
        model_mean = model_scores.mean()
        print(f"{model} {metric}: {model_mean} (baseline diff: {model_mean - baseline_mean})")

        # McNemar's test on the baseline-vs-model contingency table.
        diff_metric = get_baseline_diff(df, model, metric)
        print(f"{model} {metric} baseline diff mcnemar p-value: {mcnemar(diff_metric).pvalue}")

        # Wilcoxon signed-rank test on the same pairs, cast to integers.
        wilcoxon_result = wilcoxon(baseline_scores.astype(int), model_scores.astype(int))
        wilcoxon_p = wilcoxon_result.pvalue
        print(f"{model} {metric} baseline diff wilcoxon p-value: {wilcoxon_p}")

    # Blank line between systems in the printed report.
    print()

corrector is_factual: 0.42 (baseline diff: 0.0)
corrector is_factual baseline diff mcnemar p-value: 1.0
corrector is_factual baseline diff wilcoxon p-value: 1.0
corrector has_extrinsic_and_fully_factual: 0.45 (baseline diff: 0.010000000000000009)
corrector has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 1.0
corrector has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.7054569861112734

pinocchio is_factual: 0.43 (baseline diff: 0.010000000000000009)
pinocchio is_factual baseline diff mcnemar p-value: 1.0
pinocchio is_factual baseline diff wilcoxon p-value: 0.5637028616507731
pinocchio has_extrinsic_and_fully_factual: 0.43 (baseline diff: -0.010000000000000009)
pinocchio has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 1.0
pinocchio has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.5637028616507731

meng-rl is_factual: 0.55 (baseline diff: 0.13000000000000006)
meng-rl is_factual baseline diff mcnemar p-value: 0.004425048828125
m



In [15]:
import pandas as pd
import json
from statsmodels.stats.contingency_tables import mcnemar

# Load the fully annotated summaries. The JSON is read as a mapping: each
# value becomes one DataFrame row and the corresponding key becomes its index
# label. The encoding is pinned to UTF-8 (the JSON interchange encoding) so
# the read does not depend on the platform's locale default.
with open("results/evaluation/bart-extrinsic-fully-annotated-139-summaries.json", "r", encoding="utf-8") as f:
    json_sums = json.load(f)
df = pd.DataFrame(list(json_sums.values()), index=list(json_sums))


def get_baseline_diff(df, model, metric):
    """Cross-tabulate the baseline's outcomes against a model's outcomes.

    Args:
        df: DataFrame holding one column per ``<system>_<metric>`` pair.
        model: Prefix of the model column to compare with ``baseline-bart``.
        metric: Metric suffix shared by the baseline and model columns.

    Returns:
        A 2x2 DataFrame of pair counts. Rows are the baseline outcome and
        columns are the model outcome, both ordered ``[False, True]``.
        Combinations that never occur are reported as 0 instead of being
        left out, so the table always has the square shape that a paired
        test such as McNemar's needs.

    Raises:
        KeyError: If either ``baseline-bart_<metric>`` or ``<model>_<metric>``
            is not a column of ``df``.
    """
    baseline = df[f"baseline-bart_{metric}"]
    candidate = df[f"{model}_{metric}"]

    # Rows with a missing value on either side are left out of the counts
    # (crosstab ignores them as well); doing it up front keeps the bool cast
    # below from turning NaN into True.
    # NOTE(review): assumes the columns hold True/False (or 0/1) flags - confirm.
    both_present = baseline.notna() & candidate.notna()
    table = pd.crosstab(
        index=baseline[both_present].astype(bool),
        columns=candidate[both_present].astype(bool),
    )

    # crosstab only emits labels that actually occur, so a system that is
    # always (or never) correct would yield a 2x1 / 1x2 / 1x1 table. Force
    # both outcomes onto both axes and fill the missing cells with 0.
    outcomes = [False, True]
    return table.reindex(index=outcomes, columns=outcomes, fill_value=0)

# Report, for each listed system and metric, the system's mean score, its
# difference from the BART baseline, and the McNemar p-value for that pairing.
for model in ("fbs_classifier",):
    for metric in ("is_factual", "has_extrinsic_and_fully_factual"):
        baseline_scores = df[f"baseline-bart_{metric}"]
        model_scores = df[f"{model}_{metric}"]

        # Column means and their difference.
        baseline_mean = baseline_scores.mean()
        model_mean = model_scores.mean()
        print(f"{model} {metric}: {model_mean} (baseline diff: {model_mean - baseline_mean})")

        # McNemar's test on the baseline-vs-model contingency table.
        diff_metric = get_baseline_diff(df, model, metric)
        print(f"{model} {metric} baseline diff p-value: {mcnemar(diff_metric).pvalue}")

    # Blank line between systems in the printed report.
    print()

fbs_classifier is_factual: 0.5827338129496403 (baseline diff: 0.1007194244604317)
fbs_classifier is_factual baseline diff p-value: 0.001312255859375
fbs_classifier has_extrinsic_and_fully_factual: 0.5035971223021583 (baseline diff: 0.007194244604316502)
fbs_classifier has_extrinsic_and_fully_factual baseline diff p-value: 1.0

