In [125]:
import pandas as pd
import numpy as np
import collections
import os
from statsmodels.stats.contingency_tables import mcnemar
from scipy import stats
from mlxtend.evaluate import mcnemar_table

## functions to calculate metrics

In [None]:
# Sources:
# https://www.deepset.ai/blog/metrics-to-evaluate-a-question-answering-system
# https://kierszbaumsamuel.medium.com/f1-score-in-nlp-span-based-qa-task-5b115a5e7d41

In [77]:
# Recall measures how many times the correct document was among the retrieved documents
# For a single query, the output is binary: either a document is contained in the selection, or it is not
def correct_retrieved_doc(row):
    source_cols = ["source_1", "source_2", "source_3", "source_4", "source_5"]
    for col in source_cols:
        if row[col] == row["source"]:
            return 1
    return 0

def recall(output):
    return sum(output.apply(correct_retrieved_doc)) / output.shape[0]

In [78]:
# Exact match measures the proportion of documents where the predicted answer is identical to the correct answer
def is_exact_match(row):
    if row["actual_answer"] == row["answer"]:
        return 1
    return 0

def exact_match(output):
    return sum(output.apply(is_exact_match)) / output.shape[0]

In [79]:
# F1 measures the word overlap between the labeled and the predicted answer
# tp: number of tokens* that are shared between the correct answer and the prediction.
# fp: number of tokens that are in the prediction but not in the correct answer.
# fn: number of tokens that are in the correct answer but not in the prediction.
def get_f1(row):
    real_answer = row["actual_answer"].split()
    gen_answer = row["answer"].split()
    common = collections.Counter(real_answer) & collections.Counter(gen_answer)
    num_same = sum(common.values())
    
    if len(real_answer) == 0 or len(gen_answer) == 0:
        return int(real_answer == gen_answer)
    
    if num_same == 0:
        return 0
    
    precision = 1.0 * num_same / len(gen_answer)
    recall = 1.0 * num_same / len(real_answer)
    f1 = (2 * precision * recall) / (precision + recall)
    
    return f1

def overall_f1(output):
    return sum(output.apply(get_f1)) / output.shape[0]

In [131]:
# statistical significance
# mcnemar test for recall and exact match
def stat_sig_mcnemar(baseline, variation):
    contingency_table = mcnemar_table(y_target=np.ones(len(baseline)), y_model1=baseline, y_model2=variation)
    return mcnemar(contingency_table).pvalue

# t test for f1
def stat_sig_t_test(baseline, variation):
    t_stat, p_val = stats.ttest_ind(baseline, variation)
    return p_val

## load results

In [21]:
non_rag_baseline = pd.read_csv("results/results_baseline.csv")
non_rag_baseline.drop(["source_1", "source_2", "source_3", "source_4", "source_5"], axis=1, inplace=True)
rag_baseline = pd.read_csv("results/results_rag_bad_embedding.csv")

In [31]:
variation1 = pd.read_csv("results/results_rag.csv")
variation2 = pd.read_csv("results/results_rag_few_shot_bad_embedding.csv")
variation3 = pd.read_csv("results/results_rag_few_shot.csv")

In [61]:
train_data = pd.read_csv("gemma3_questions_balanced_sample_200.csv")
train_data.drop(["source", "gemma3:12b_answer", "gemma3:12b_question"], axis=1, inplace=True)
train_data.columns = ["source", "question_type", "content_category", "actual_answer"]

In [None]:
# question, answer, source_1, source_2, source_3, source_4, source_5

In [63]:
# combine results with train data
non_rag_baseline = pd.concat([non_rag_baseline, train_data], axis=1)
rag_baseline = pd.concat([rag_baseline, train_data], axis=1)
variation1 = pd.concat([variation1, train_data], axis=1)
variation2 = pd.concat([variation2, train_data], axis=1)
variation3 = pd.concat([variation3, train_data], axis=1)

## calculate metrics

In [87]:
# get metrics for baseline models
non_rag_baseline["em"] = non_rag_baseline.apply(is_exact_match, axis=1)
non_rag_baseline["f1"] = non_rag_baseline.apply(get_f1, axis=1)

rag_baseline["recall"] = rag_baseline.apply(correct_retrieved_doc, axis=1)
rag_baseline["em"] = rag_baseline.apply(is_exact_match, axis=1)
rag_baseline["f1"] = rag_baseline.apply(get_f1, axis=1)

In [118]:
print("non rag metrics")
print(f'em: {non_rag_baseline["em"].mean()}')
print(f'f1: {non_rag_baseline["f1"].mean()}')
print("\n")

print("rag baseline metrics")
print(f'recall: {rag_baseline["recall"].mean()}')
print(f'em: {rag_baseline["em"].mean()}')
print(f'f1: {rag_baseline["f1"].mean()}')

non rag metrics
em: 0.0
f1: 0.05685775923885378


rag baseline metrics
recall: 0.41
em: 0.0
f1: 0.11131313260860196


In [132]:
# statistical test for em, f1 for non rag vs rag baseline
print("non rag vs rag baseline")
print("em:", stat_sig_mcnemar(non_rag_baseline["em"], rag_baseline["em"]))
print("f1:", stat_sig_t_test(non_rag_baseline["f1"], rag_baseline["f1"]))

non rag vs rag baseline
em: 1.0
f1: 2.5059085750187265e-06


In [133]:
variations = [variation1, variation2, variation3]
var_names = ["variation 1", "variation 2", "variation 3"]
metric_cols = ["recall", "em", "f1"]

for i in range(len(variations)):
    var = variations[i]
    # add columns with metrics
    var["recall"] = var.apply(correct_retrieved_doc, axis=1)
    var["em"] = var.apply(is_exact_match, axis=1)
    var["f1"] = var.apply(get_f1, axis=1)
    
    # print metrics
    print(f'{var_names[i]} metrics')
    print("recall:", var["recall"].mean())
    print("em:", var["em"].mean())
    print("f1:", var["f1"].mean())
    print("\n")
    
    # significance tests against rag baseline
    print(f'rag baseline vs {var_names[i]}')
    print("recall:", stat_sig_mcnemar(rag_baseline["recall"], var["recall"]))
    print("em:", stat_sig_mcnemar(rag_baseline["em"], var["em"]))
    print("f1:", stat_sig_t_test(rag_baseline["f1"], var["f1"]))
    print("\n")

variation 1 metrics
recall: 0.545
em: 0.0
f1: 0.12425176189147388


rag baseline vs variation 1
recall: 4.1934157934520044e-05
em: 1.0
f1: 0.3479308473811954


variation 2 metrics
recall: 0.41
em: 0.0
f1: 0.11131313260860196


rag baseline vs variation 2
recall: 1.0
em: 1.0
f1: 1.0


variation 3 metrics
recall: 0.545
em: 0.0
f1: 0.13067888956981413


rag baseline vs variation 3
recall: 4.1934157934520044e-05
em: 1.0
f1: 0.19635264626893023


