## BART 100

In [14]:
import pandas as pd
import json
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import wilcoxon

# Read the evaluation records. JSON text is UTF-8 by specification, so decode
# explicitly instead of relying on the platform's locale default encoding.
with open("results/evaluation/bart-test-extrinsic-100-summaries.json", "r", encoding="utf-8") as f:
    json_sums = json.load(f)
# One row per top-level JSON key (used as the index).
# Presumably each value is a flat dict of per-system flags - verify against the file.
df = pd.DataFrame(json_sums.values(), index=list(json_sums.keys()))


def get_baseline_diff(df, model, metric):
    """Cross-tabulate the BART baseline's outcomes against ``model``'s for ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``baseline-bart_{metric}`` and a ``{model}_{metric}`` column.
    model : str
        Column prefix of the system being compared with the baseline.
    metric : str
        Column suffix shared by the baseline and the model column.

    Returns
    -------
    pandas.DataFrame
        Always a 2x2 table of paired counts. Rows are the baseline outcome and
        columns are the model outcome, both ordered ``[False, True]``.
    """
    outcomes = [False, True]
    # Assumes both columns hold complete boolean (or 0/1) flags - TODO confirm.
    # Missing values would be silently coerced by astype(bool).
    baseline = df[f"baseline-bart_{metric}"].astype(bool)
    candidate = df[f"{model}_{metric}"].astype(bool)
    table = pd.crosstab(index=baseline, columns=candidate)
    # crosstab only emits labels that actually occur, so a system that is always
    # (or never) correct would yield a 2x1, 1x2 or 1x1 table. McNemar's test is
    # defined on the full 2x2 table, so pin the layout and zero-fill empty cells.
    return table.reindex(index=outcomes, columns=outcomes, fill_value=0)

# Compare each system with the BART baseline on every metric: print the system's
# mean, its difference from the baseline mean, and two paired-test p-values.
for model in ["corrector", "pinocchio", "rl-fact", "gef_classifier", "gef_oracle"]:
    for metric in ["is_factual", "has_extrinsic_and_fully_factual"]:
        contingency = get_baseline_diff(df, model, metric)
        baseline_scores = df[f"baseline-bart_{metric}"]
        model_scores = df[f"{model}_{metric}"]

        baseline_rate = baseline_scores.mean()
        model_rate = model_scores.mean()
        print(f"{model} {metric}: {model_rate} (baseline diff: {model_rate - baseline_rate})")

        # McNemar's test on the baseline-vs-model contingency table.
        mcnemar_p = mcnemar(contingency).pvalue
        print(f"{model} {metric} baseline diff mcnemar p-value: {mcnemar_p}")

        # Wilcoxon signed-rank test on the same paired outcomes, cast to integers.
        paired_result = wilcoxon(baseline_scores.astype(int), model_scores.astype(int))
        print(f"{model} {metric} baseline diff wilcoxon p-value: {paired_result.pvalue}")

    # Blank line between systems.
    print()

corrector is_factual: 0.42 (baseline diff: 0.0)
corrector is_factual baseline diff mcnemar p-value: 1.0
corrector is_factual baseline diff wilcoxon p-value: 1.0
corrector has_extrinsic_and_fully_factual: 0.45 (baseline diff: 0.010000000000000009)
corrector has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 1.0
corrector has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.7054569861112734

pinocchio is_factual: 0.43 (baseline diff: 0.010000000000000009)
pinocchio is_factual baseline diff mcnemar p-value: 1.0
pinocchio is_factual baseline diff wilcoxon p-value: 0.5637028616507731
pinocchio has_extrinsic_and_fully_factual: 0.43 (baseline diff: -0.010000000000000009)
pinocchio has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 1.0
pinocchio has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.5637028616507731

rl-fact is_factual: 0.55 (baseline diff: 0.13000000000000006)
rl-fact is_factual baseline diff mcnemar p-value: 0.004425048828125
r



## BART 125

In [15]:
import pandas as pd
import json
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import wilcoxon

# Read the evaluation records. JSON text is UTF-8 by specification, so decode
# explicitly instead of relying on the platform's locale default encoding.
with open("results/evaluation/bart-test-extrinsic-125-summaries.json", "r", encoding="utf-8") as f:
    json_sums = json.load(f)
# One row per top-level JSON key (used as the index).
# Presumably each value is a flat dict of per-system flags - verify against the file.
df = pd.DataFrame(json_sums.values(), index=list(json_sums.keys()))


def get_baseline_diff(df, model, metric):
    """Cross-tabulate the BART baseline's outcomes against ``model``'s for ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``baseline-bart_{metric}`` and a ``{model}_{metric}`` column.
    model : str
        Column prefix of the system being compared with the baseline.
    metric : str
        Column suffix shared by the baseline and the model column.

    Returns
    -------
    pandas.DataFrame
        Always a 2x2 table of paired counts. Rows are the baseline outcome and
        columns are the model outcome, both ordered ``[False, True]``.
    """
    outcomes = [False, True]
    # Assumes both columns hold complete boolean (or 0/1) flags - TODO confirm.
    # Missing values would be silently coerced by astype(bool).
    baseline = df[f"baseline-bart_{metric}"].astype(bool)
    candidate = df[f"{model}_{metric}"].astype(bool)
    table = pd.crosstab(index=baseline, columns=candidate)
    # crosstab only emits labels that actually occur, so a system that is always
    # (or never) correct would yield a 2x1, 1x2 or 1x1 table. McNemar's test is
    # defined on the full 2x2 table, so pin the layout and zero-fill empty cells.
    return table.reindex(index=outcomes, columns=outcomes, fill_value=0)

# Compare each system with the BART baseline on every metric: print the system's
# mean, its difference from the baseline mean, and two paired-test p-values.
for model in ["corrector", "pinocchio", "rl-fact", "gef_classifier", "gef_oracle"]:
    for metric in ["is_factual", "has_extrinsic_and_fully_factual"]:
        contingency = get_baseline_diff(df, model, metric)
        baseline_scores = df[f"baseline-bart_{metric}"]
        model_scores = df[f"{model}_{metric}"]

        baseline_rate = baseline_scores.mean()
        model_rate = model_scores.mean()
        print(f"{model} {metric}: {model_rate} (baseline diff: {model_rate - baseline_rate})")

        # McNemar's test on the baseline-vs-model contingency table.
        mcnemar_p = mcnemar(contingency).pvalue
        print(f"{model} {metric} baseline diff mcnemar p-value: {mcnemar_p}")

        # Wilcoxon signed-rank test on the same paired outcomes, cast to integers.
        paired_result = wilcoxon(baseline_scores.astype(int), model_scores.astype(int))
        print(f"{model} {metric} baseline diff wilcoxon p-value: {paired_result.pvalue}")

    # Blank line between systems.
    print()

corrector is_factual: 0.44 (baseline diff: 0.0)
corrector is_factual baseline diff mcnemar p-value: 1.0
corrector is_factual baseline diff wilcoxon p-value: 1.0
corrector has_extrinsic_and_fully_factual: 0.464 (baseline diff: 0.0)
corrector has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 1.0
corrector has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 1.0

pinocchio is_factual: 0.448 (baseline diff: 0.008000000000000007)
pinocchio is_factual baseline diff mcnemar p-value: 1.0
pinocchio is_factual baseline diff wilcoxon p-value: 0.5637028616507731
pinocchio has_extrinsic_and_fully_factual: 0.44 (baseline diff: -0.02400000000000002)
pinocchio has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 0.375
pinocchio has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.17971249487899976

rl-fact is_factual: 0.592 (baseline diff: 0.15199999999999997)
rl-fact is_factual baseline diff mcnemar p-value: 0.00015652179718017578
rl-fact is_factual base



In [2]:
import pandas as pd
import json
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import wilcoxon

# Read the evaluation records. JSON text is UTF-8 by specification, so decode
# explicitly instead of relying on the platform's locale default encoding.
with open("results/evaluation/pegasus-test-extrinsic-100-summaries.json", "r", encoding="utf-8") as f:
    json_sums = json.load(f)
# One row per top-level JSON key (used as the index).
# Presumably each value is a flat dict of per-system flags - verify against the file.
df = pd.DataFrame(json_sums.values(), index=list(json_sums.keys()))


def get_baseline_diff(df, model, metric):
    """Cross-tabulate the PEGASUS baseline's outcomes against ``model``'s for ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``baseline-pegasus_{metric}`` and a ``{model}_{metric}`` column.
    model : str
        Column prefix of the system being compared with the baseline.
    metric : str
        Column suffix shared by the baseline and the model column.

    Returns
    -------
    pandas.DataFrame
        Always a 2x2 table of paired counts. Rows are the baseline outcome and
        columns are the model outcome, both ordered ``[False, True]``.
    """
    outcomes = [False, True]
    # Assumes both columns hold complete boolean (or 0/1) flags - TODO confirm.
    # Missing values would be silently coerced by astype(bool).
    baseline = df[f"baseline-pegasus_{metric}"].astype(bool)
    candidate = df[f"{model}_{metric}"].astype(bool)
    table = pd.crosstab(index=baseline, columns=candidate)
    # crosstab only emits labels that actually occur, so a system that is always
    # (or never) correct would yield a 2x1, 1x2 or 1x1 table. McNemar's test is
    # defined on the full 2x2 table, so pin the layout and zero-fill empty cells.
    return table.reindex(index=outcomes, columns=outcomes, fill_value=0)

# Compare each system with the PEGASUS baseline on every metric: print the system's
# mean, its difference from the baseline mean, and two paired-test p-values.
for model in ["gef_classifier", "gef_oracle"]:
    for metric in ["is_factual", "has_extrinsic_and_fully_factual"]:
        contingency = get_baseline_diff(df, model, metric)
        baseline_scores = df[f"baseline-pegasus_{metric}"]
        model_scores = df[f"{model}_{metric}"]

        baseline_rate = baseline_scores.mean()
        model_rate = model_scores.mean()
        print(f"{model} {metric}: {model_rate} (baseline diff: {model_rate - baseline_rate})")

        # McNemar's test on the baseline-vs-model contingency table.
        mcnemar_p = mcnemar(contingency).pvalue
        print(f"{model} {metric} baseline diff mcnemar p-value: {mcnemar_p}")

        # Wilcoxon signed-rank test on the same paired outcomes, cast to integers.
        paired_result = wilcoxon(baseline_scores.astype(int), model_scores.astype(int))
        print(f"{model} {metric} baseline diff wilcoxon p-value: {paired_result.pvalue}")

    # Blank line between systems.
    print()

gef_classifier is_factual: 0.65 (baseline diff: 0.07000000000000006)
gef_classifier is_factual baseline diff mcnemar p-value: 0.11846923828124999
gef_classifier is_factual baseline diff wilcoxon p-value: 0.07070114486598297
gef_classifier has_extrinsic_and_fully_factual: 0.54 (baseline diff: -0.06999999999999995)
gef_classifier has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 0.18924713134765625
gef_classifier has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.12663045794761718

gef_oracle is_factual: 0.79 (baseline diff: 0.21000000000000008)
gef_oracle is_factual baseline diff mcnemar p-value: 9.5367431640625e-07
gef_oracle is_factual baseline diff wilcoxon p-value: 4.592833711753968e-06
gef_oracle has_extrinsic_and_fully_factual: 0.74 (baseline diff: 0.13)
gef_oracle has_extrinsic_and_fully_factual baseline diff mcnemar p-value: 0.002349853515625
gef_oracle has_extrinsic_and_fully_factual baseline diff wilcoxon p-value: 0.0016162222150599857

