In [None]:
# https://rajpurkar.github.io/SQuAD-explorer/
import re
import string
import warnings
from collections import Counter

def normalize_answer(s):
    """Canonicalise an answer string so two answers can be compared.

    The steps run in this order: lowercase the text, delete every character
    in ``string.punctuation``, replace the standalone words "a", "an" and
    "the" with a space, then collapse whitespace runs into single spaces
    (which also trims both ends).
    """
    lowered = s.lower()
    # A deletion-only translation table strips all ASCII punctuation at once.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    # split() with no argument drops leading/trailing and repeated whitespace.
    return ' '.join(without_articles.split())

def get_tokens(s):
    """Return the whitespace-separated tokens of the normalised answer.

    A falsy input (``None`` or an empty string) yields an empty list without
    being normalised.
    """
    return normalize_answer(s).split() if s else []

def compute_exact(a_gold, a_pred):
    """Return 1 if both answers are identical after normalisation, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0

def compute_f1(a_gold, a_pred):
    """Token-overlap F1, precision and recall between a gold and a predicted answer.

    Returns:
        A 3-tuple ``(f1, precision, recall)``.
        * If either answer has no tokens, all three values are 1 when both
          are empty and 0 otherwise.
        * If the answers share no tokens, all three values are 0.
        * Otherwise precision is overlap / predicted tokens, recall is
          overlap / gold tokens, and F1 is their harmonic mean.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # With an empty side the ratios below are undefined; score by equality.
    if not gold_toks or not pred_toks:
        both_empty = int(gold_toks == pred_toks)
        return both_empty, both_empty, both_empty

    # Multiset intersection: each token counts as often as it appears in both.
    overlap = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0, 0, 0

    precision = float(num_same) / len(pred_toks)
    recall = float(num_same) / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall

def read_answers(filename):
    """Read a UTF-8 text file and return its lines as a list.

    Each returned line keeps its trailing newline, if it had one.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        lines = list(handle)
    return lines

def evaluate(gold_file, pred_file):
    """Score a predictions file against a gold file, line by line.

    Line *i* of ``pred_file`` is compared with line *i* of ``gold_file``
    using ``compute_f1`` and ``compute_exact``. The per-line scores are
    summed, divided by the number of gold lines and scaled to percentages.

    Args:
        gold_file: Path to the reference answers, one answer per line.
        pred_file: Path to the predicted answers, one answer per line.

    Returns:
        dict with the keys ``"F1"``, ``"Recall"`` and ``"Exact Match"``,
        each a percentage. All three are 0.0 when the gold file has no
        lines (previously this raised ``ZeroDivisionError``).

    Warns:
        UserWarning: if the two files have a different number of lines.
            Pairing stops at the shorter file while the average is still
            taken over the gold line count, so unmatched gold lines count
            as 0 and surplus prediction lines are ignored.
    """
    gold_answers = read_answers(gold_file)
    predictions = read_answers(pred_file)
    total = len(gold_answers)

    if total != len(predictions):
        # zip() below truncates silently; make the misalignment visible.
        warnings.warn(
            f"gold file has {total} lines but prediction file has "
            f"{len(predictions)}; only the first "
            f"{min(total, len(predictions))} pairs are scored",
            stacklevel=2,
        )

    if total == 0:
        return {"F1": 0.0, "Recall": 0.0, "Exact Match": 0.0}

    f1_sum, recall_sum, exact_sum = 0.0, 0.0, 0
    for gold, pred in zip(gold_answers, predictions):
        # Precision is not reported by this function, so it is not accumulated.
        pair_f1, _, pair_recall = compute_f1(gold, pred)
        f1_sum += pair_f1
        recall_sum += pair_recall
        exact_sum += compute_exact(gold, pred)

    return {
        "F1": 100.0 * f1_sum / total,
        "Recall": 100.0 * recall_sum / total,
        "Exact Match": 100.0 * exact_sum / total
    }

# Mount Google Drive so the answer files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
# Score the RoBERTa predictions.
predictions = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/Roberta_answer/roberta_model_answer.txt'
# NOTE(review): the gold file sits under t5_answer/ although these are RoBERTa
# predictions — presumably one shared reference set; confirm.
gold = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/t5_answer/t5_gold_answer.txt'
# evaluate() returns a dict of percentage scores (F1, Recall, Exact Match).
results1 = evaluate(gold, predictions)
print(results1)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'F1': 43.04146909184546, 'Recall': 42.227779229104456, 'Exact Match': 32.87671232876713}


In [None]:
# Mount Google Drive so the answer files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
# Score the T5 predictions against the T5 gold answers.
# Note: this rebinds the notebook-level names `predictions` and `gold`.
predictions = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/t5_answer/model_answer_t5.txt'
gold = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/t5_answer/t5_gold_answer.txt'
# evaluate() returns a dict of percentage scores (F1, Recall, Exact Match).
results2 = evaluate(gold, predictions)
print(results2)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'F1': 3.159836282311732, 'Recall': 13.824470304012987, 'Exact Match': 0.0}


In [None]:
# Mount Google Drive so the answer files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
# Score the Mistral predictions.
# Note: this rebinds the notebook-level names `predictions` and `gold`.
predictions = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/mistral_answer/mistral_answer.txt'
# NOTE(review): the gold file sits under t5_answer/ although these are Mistral
# predictions — presumably one shared reference set; confirm.
gold = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/t5_answer/t5_gold_answer.txt'
# evaluate() returns a dict of percentage scores (F1, Recall, Exact Match).
results3 = evaluate(gold, predictions)
print(results3)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'F1': 3.3183087214459523, 'Recall': 5.071700254925944, 'Exact Match': 0.684931506849315}


In [None]:
# `drive` is not imported in this cell; it relies on the
# `from google.colab import drive` executed by an earlier cell.
drive.mount('/content/drive')
# Prediction files for the three models being compared.
roberta_ans = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/Roberta_answer/roberta_model_answer.txt'
t5_ans = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/t5_answer/model_answer_t5.txt'
mis_ans = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/mistral_answer/mistral_answer.txt'
# Single gold file that all three prediction files are scored against below.
gold = '/content/drive/My Drive/Colab Notebooks/NLP2/model_answer/t5_answer/t5_gold_answer.txt'

def evaluate_sig(gold_file, pred_file):
    """Return the per-question F1 scores of a predictions file.

    Line *i* of ``pred_file`` is scored against line *i* of ``gold_file``
    with ``compute_f1``. Only the F1 component is kept.

    Args:
        gold_file: Path to the reference answers, one answer per line.
        pred_file: Path to the predicted answers, one answer per line.

    Returns:
        A list with one ``{"F1": value}`` dict per scored line pair, in
        file order.

    Warns:
        UserWarning: if the two files have a different number of lines.
            Pairing stops at the shorter file, so the result is shorter
            than the gold file and may no longer line up with score lists
            computed for other prediction files.
    """
    gold_answers = read_answers(gold_file)
    predictions = read_answers(pred_file)

    if len(gold_answers) != len(predictions):
        # zip() below truncates silently; make the misalignment visible.
        warnings.warn(
            f"gold file has {len(gold_answers)} lines but prediction file has "
            f"{len(predictions)}; only the first "
            f"{min(len(gold_answers), len(predictions))} pairs are scored",
            stacklevel=2,
        )

    # compute_f1 returns (f1, precision, recall); only F1 is needed here.
    return [
        {"F1": compute_f1(gold, pred)[0]}
        for gold, pred in zip(gold_answers, predictions)
    ]

# Per-question scores for each model, all measured against the same gold file.
roberta_scores = evaluate_sig(gold, roberta_ans)
t5_scores = evaluate_sig(gold, t5_ans)
mis_scores = evaluate_sig(gold, mis_ans)

# Pull the F1 value out of each per-question dict.
# Model 1 = RoBERTa, model 2 = T5, model 3 = Mistral.
f1_scores_model1 = [score['F1'] for score in roberta_scores]
f1_scores_model2 = [score['F1'] for score in t5_scores]
f1_scores_model3 = [score['F1'] for score in mis_scores]

from scipy import stats

def conduct_t_test(scores_model_a, scores_model_b, model_a_name, model_b_name, alpha=0.05):
    """Run a paired t-test on two score lists and print the outcome.

    Args:
        scores_model_a: Scores of the first model, one per question.
        scores_model_b: Scores of the second model, paired by position with
            ``scores_model_a``.
        model_a_name: Label for the first model in the printed report.
        model_b_name: Label for the second model in the printed report.
        alpha: Significance threshold; the difference is reported as
            significant when the p-value is strictly below it. Defaults to
            0.05, the value that was previously hard-coded.

    Returns:
        None. The t-statistic, p-value and verdict are printed.
    """
    t_statistic, p_value = stats.ttest_rel(scores_model_a, scores_model_b)
    print(f"Paired T-Test between {model_a_name} and {model_b_name}:")
    print(f"  T-statistic: {t_statistic}, P-value: {p_value}")
    if p_value < alpha:
        print("  The difference is statistically significant.\n")
    else:
        print("  No statistically significant difference.\n")

# Conduct paired t-tests between each pair of models.
# The labels map to the score lists built above:
# Model 1 = RoBERTa, Model 2 = T5, Model 3 = Mistral.
conduct_t_test(f1_scores_model1, f1_scores_model2, 'Model 1', 'Model 2')
conduct_t_test(f1_scores_model1, f1_scores_model3, 'Model 1', 'Model 3')
conduct_t_test(f1_scores_model2, f1_scores_model3, 'Model 2', 'Model 3')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Paired T-Test between Model 1 and Model 2:
  T-statistic: 21.378974249623514, P-value: 2.6080521451742205e-75
  The difference is statistically significant.

Paired T-Test between Model 1 and Model 3:
  T-statistic: 20.872705989695085, P-value: 1.1297994852892257e-72
  The difference is statistically significant.

Paired T-Test between Model 2 and Model 3:
  T-statistic: -0.32033981060363065, P-value: 0.748825526567398
  No statistically significant difference.

