In [3]:
from scipy.stats import norm

def calculate_p_value(p1, p2, n):
    SE = ((p1*(1-p1)/n) + (p2*(1-p2)/n))**0.5
    z = (p2-p1)/SE
    p = 2*(1-norm.cdf(abs(z)))
    return p

In [10]:
calculate_p_value(0.9291, 0.9350, 1015)

np.float64(0.5973713819971624)

# Wilcoxon

In [28]:
task = "PMB"
prop = "1"
prop_to_prob ={"1": "100%", "0.5": "50%", "0.1": "10%", "0.25": "25%"}
model_to_compare = "BERT_Encoder"
lang = "it"

In [2]:
import os
import numpy as np
# go to parent directory
os.chdir("..")

In [29]:
# open the files
TI_preds = {"Run1": [], "Run2": [], "Run3": []}
M2C_preds = {"Run1": [], "Run2": [], "Run3": []}
TI_accs = {"Run1": [], "Run2": [], "Run3": []}
M2C_accs = {"Run1": [], "Run2": [], "Run3": []}

runs = ["Run1", "Run2", "Run3"]



for run in runs:
    TI_file = f"predictions/{task}/predictions_TagInsert_{lang}_{prop}_{run}.csv"
    m2c_file = f"predictions/{task}/predictions_{model_to_compare}_{lang}_{prop}_{run}.csv"
    with open(TI_file) as f:
        # get number of lines in the file
        rows = sum(1 for line in f)
    with open(TI_file) as f:
        
        for i, line in enumerate(f):
            sentence = []
            preds = line.strip().split(", ")
            # remove the last comma
            # preds[-1] = preds[-1][:-1]
            for j, pred in enumerate(preds):
                if j == len(preds) - 1 and not pred.startswith("Sentence"):
                    # remove the last comma
                    pred = pred[:-1]
                if pred.startswith("Sentence"):
                    TI_accs[run].append(pred.split(": ")[1])
                elif i != rows - 1:
                    word, gold, prediction, order = pred.split("|")
                    sentence.append((word, gold, prediction, order))
            if len(sentence) != 0:
                TI_preds[run].append(sentence)

    with open(m2c_file) as f:
        for i, line in enumerate(f):
            sentence = []
            preds = line.strip().split(", ")
            # remove the last comma
            # preds[-1] = preds[-1][:-1]
            for j, pred in enumerate(preds):
                if j == len(preds) - 1 and not pred.startswith("Sentence") and model_to_compare != "BERT_Encoder":
                    # remove the last comma
                    pred = pred[:-1]
                if pred.startswith("Sentence"):
                    M2C_accs[run].append(pred.split(": ")[1])
                elif i != rows - 1:
                    word, gold, prediction = pred.split("|")
                    sentence.append((word, gold, prediction))
            if len(sentence) != 0:
                M2C_preds[run].append(sentence)

# convert all the accuracies to floats
for run in runs:
    TI_accs[run] = [float(acc) for acc in TI_accs[run]]
    M2C_accs[run] = [float(acc) for acc in M2C_accs[run]]

In [30]:
for run in runs:
    correct_ti = 0
    correct_m2c = 0
    total = 0
    for ti_preds, m2c_preds, ti_accs, m2c_accs in zip(TI_preds[run], M2C_preds[run], TI_accs[run], M2C_accs[run]):
        sentence_len = len(ti_preds)
        total += sentence_len
        for i, (ti, m2c) in enumerate(zip(ti_preds, m2c_preds)):
            ti_word, gold, ti_pred, ti_order = ti
            m2c_word, _, m2c_pred = m2c
            if ti_pred == gold:
                correct_ti += 1
            if m2c_pred == gold:
                correct_m2c += 1

    print(correct_ti, correct_m2c, total, correct_ti / total, correct_m2c / total)

949 944 1015 0.9349753694581281 0.9300492610837439
938 943 1015 0.9241379310344827 0.929064039408867
945 944 1015 0.9310344827586207 0.9300492610837439


In [31]:
from scipy.stats import wilcoxon

TI_runs = np.zeros((total, len(runs)), dtype=int)
M2C_runs = np.zeros((total, len(runs)), dtype=int)
for run in runs:
    run_id = 0
    for ti_preds, m2c_preds, ti_accs, m2c_accs in zip(TI_preds[run], M2C_preds[run], TI_accs[run], M2C_accs[run]):
        for i, (ti, m2c) in enumerate(zip(ti_preds, m2c_preds)):
            ti_word, gold, ti_pred, ti_order = ti
            m2c_word, _, m2c_pred = m2c
            if ti_pred == gold:
                TI_runs[run_id, runs.index(run)] = 1
            else:
                TI_runs[run_id, runs.index(run)] = 0
            if m2c_pred == gold:
                M2C_runs[run_id, runs.index(run)] = 1
            else:
                M2C_runs[run_id, runs.index(run)] = 0
            run_id += 1
            

# Step 1: Average accuracy per test instance across the 3 runs
model_a_mean = TI_runs.mean(axis=1)
model_b_mean = M2C_runs.mean(axis=1)

# Step 2: Compute per-sample difference
diffs = model_a_mean - model_b_mean

# Step 3: Run Wilcoxon signed-rank test
stat, p_value = wilcoxon(diffs)

print(f"Wilcoxon statistic: {stat}")
print(f"P-value: {p_value}")

Wilcoxon statistic: 1882.0
P-value: 0.3108048407590046


# Permutation Test

In [32]:
from sklearn.utils import shuffle

def permutation_test_multiple_runs(y_true, y_preds_1, y_preds_2, n_iterations=1000):
    """
    Perform a permutation test to compare the accuracy of two models over multiple runs on the same dataset.
    
    Args:
    - y_true (array-like): The true labels (ground truth) for the test set.
    - y_preds_1 (array-like): A list of arrays where each array contains the predictions of model 1 for each run.
    - y_preds_2 (array-like): A list of arrays where each array contains the predictions of model 2 for each run.
    - n_iterations (int): The number of permutations to perform (default is 1000).
    
    Returns:
    - observed_diff (float): The observed difference in accuracy between the two models.
    - p_value (float): The p-value of the permutation test, representing the probability
      of observing a difference as extreme as the observed one under the null hypothesis.
    """
    
    # Calculate the accuracy of each model over all runs
    accuracies_1 = [np.mean(y_pred == y_true) for y_pred in y_preds_1]
    accuracies_2 = [np.mean(y_pred == y_true) for y_pred in y_preds_2]
    
    # Calculate the observed difference in accuracy between the two models
    observed_acc_1 = np.mean(accuracies_1)  # Mean accuracy of model 1 across all runs
    observed_acc_2 = np.mean(accuracies_2)  # Mean accuracy of model 2 across all runs
    observed_diff = observed_acc_1 - observed_acc_2  # The observed difference in accuracy
    
    count = 0  # To track how many times the difference from permutation is as extreme as the observed difference
    
    for _ in range(n_iterations):
        # Shuffle the true labels randomly
        y_true_shuffled = shuffle(y_true)
        
        # Calculate the accuracy for both models on the shuffled labels for each run
        accuracies_1_shuffled = [np.mean(y_pred == y_true_shuffled) for y_pred in y_preds_1]
        accuracies_2_shuffled = [np.mean(y_pred == y_true_shuffled) for y_pred in y_preds_2]
        
        # Calculate the difference in accuracy after shuffling
        diff_shuffled = np.mean(accuracies_1_shuffled) - np.mean(accuracies_2_shuffled)
        
        # If the shuffled difference is as extreme as or more extreme than the observed difference, count it
        if np.abs(diff_shuffled) >= np.abs(observed_diff):
            count += 1
    # Calculate the p-value as the proportion of times the shuffled difference was as extreme as the observed difference
    p_value = count / n_iterations
    
    return observed_diff, p_value

In [33]:
golds = []
TI_labs = [[] for _ in range(len(runs))]
M2C_labs = [[] for _ in range(len(runs))]
for run in runs:
    for ti_preds, m2c_preds, ti_accs, m2c_accs in zip(TI_preds[run], M2C_preds[run], TI_accs[run], M2C_accs[run]):
        for i, (ti, m2c) in enumerate(zip(ti_preds, m2c_preds)):
            ti_word, gold, ti_pred, ti_order = ti
            m2c_word, _, m2c_pred = m2c
            if run == "Run1":
                golds.append(gold)
            TI_labs[runs.index(run)].append(ti_pred)
            M2C_labs[runs.index(run)].append(m2c_pred)
golds = np.array(golds)
for run in runs:
    TI_labs[runs.index(run)] = np.array(TI_labs[runs.index(run)])
    M2C_labs[runs.index(run)] = np.array(M2C_labs[runs.index(run)])

In [35]:
observed_diff, p_value = permutation_test_multiple_runs(golds, TI_labs, M2C_labs, n_iterations=1000)

# Print results
print(f"Observed difference in accuracy: {observed_diff:.4f}")
print(f"P-value: {p_value:.4f}")

Observed difference in accuracy: 0.0003
P-value: 0.7080


In [8]:
def permutation_test(y_true, y_pred_1, y_pred_2, n_iterations=1000):
    """
    Perform a permutation test to compare the accuracy of two models on the same dataset.

    Args:
    - y_true (array-like): The true labels (ground truth) for the test set.
    - y_pred_1 (array-like): The predictions made by the first model.
    - y_pred_2 (array-like): The predictions made by the second model.
    - n_iterations (int): The number of permutations to perform (default is 1000).

    Returns:
    - observed_diff (float): The observed difference in accuracy between the two models.
    - p_value (float): The p-value of the permutation test, representing the probability
      of observing a difference as extreme as the observed one under the null hypothesis.
    """
    
    # Calculate the observed difference in accuracy between the two models
    observed_acc_1 = np.mean(y_pred_1 == y_true)  # Accuracy of model 1
    observed_acc_2 = np.mean(y_pred_2 == y_true)  # Accuracy of model 2
    observed_diff = observed_acc_1 - observed_acc_2  # The observed difference in accuracy
    
    count = 0  # To track how many times the difference from permutation is as extreme as the observed difference
    
    for _ in range(n_iterations):
        # Shuffle the true labels randomly
        y_true_shuffled = shuffle(y_true)
        
        # Calculate the accuracy for both models on the shuffled labels
        acc_1_shuffled = np.mean(y_pred_1 == y_true_shuffled)
        acc_2_shuffled = np.mean(y_pred_2 == y_true_shuffled)
        
        # Calculate the difference in accuracy after shuffling
        diff_shuffled = acc_1_shuffled - acc_2_shuffled
        
        # If the shuffled difference is as extreme as or more extreme than the observed difference, count it
        if np.abs(diff_shuffled) >= np.abs(observed_diff):
            count += 1
    
    # Calculate the p-value as the proportion of times the shuffled difference was as extreme as the observed difference
    p_value = count / n_iterations
    
    return observed_diff, p_value

In [None]:
for i in range(len(runs)):
    observed_diff, p_value = permutation_test(golds, TI_labs[i], M2C_labs[i], n_iterations=1000)

    # Print results
    print(f"Observed difference in accuracy: {observed_diff:.4f}")
    print(f"P-value: {p_value:.4f}")

Observed difference in accuracy: 0.0049
P-value: 0.0119
Observed difference in accuracy: -0.0049
P-value: 0.0203
Observed difference in accuracy: 0.0010
P-value: 0.7295


# Long Tail Testing

In [42]:
import json, torch

if task == "PMB":
    tgt_to_idx = json.load(open(f"data/{task}/{lang}/processed/{task}_to_idx.json"))
    idx_to_tgt = json.load(open(f"data/{task}/{lang}/processed/idx_to_{task}.json"))
    with open(f"data/{task}/{lang}/processed/word_to_idx.json", 'r', encoding='utf-8') as f:
        word_to_idx = json.load(f)
    with open(f"data/{task}/{lang}/processed/idx_to_word.json", 'r', encoding='utf-8') as f:
        idx_to_word = json.load(f)
    train_path = f"data/{task}/{lang}/processed/train_data.pth"
    val_path = f"data/{task}/{lang}/processed/val_data.pth"
    test_path = f"data/{task}/{lang}/processed/test_data.pth"
else:
    tgt_to_idx = json.load(open(f"data/{task}/processed/{prop_to_prob[prop]}/{task}_to_idx.json"))
    idx_to_tgt = json.load(open(f"data/{task}/processed/{prop_to_prob[prop]}/idx_to_{task}.json"))
    word_to_idx = json.load(open(f"data/{task}/processed/{prop_to_prob[prop]}/word_to_idx.json"))
    idx_to_word = json.load(open(f"data/{task}/processed/{prop_to_prob[prop]}/idx_to_word.json"))
    train_path = f"data/{task}/processed/{prop_to_prob[prop]}/train_data.pth"
    val_path = f"data/{task}/processed/{prop_to_prob[prop]}/val_data.pth"
    test_path = f"data/{task}/processed/{prop_to_prob[prop]}/test_data.pth"
train_data = torch.load(train_path)
val_data = torch.load(val_path)
test_data = torch.load(test_path)

In [43]:
from collections import Counter

word_freqs = Counter()
tag_freqs = Counter()

for sentence in train_data['words']:
    for word in sentence:
        if word != word_to_idx['<PAD>']:
            word_string = idx_to_word[str(word)]
            word_freqs[word_string] += 1

for sentence in train_data['tags']:
    for tag in sentence:
        if tag != tgt_to_idx['<PAD>'] and tag != tgt_to_idx['<START>']:
            tag_string = idx_to_tgt[str(tag)]
            tag_freqs[tag_string] += 1

frequent_tags = [tag for tag, freq in tag_freqs.items() if freq >= 100]
common_tags = [tag for tag, freq in tag_freqs.items() if freq >= 10 and freq < 100]
rare_tags = [tag for tag, freq in tag_freqs.items() if freq < 10]

len(frequent_tags), len(common_tags), len(rare_tags)

(13, 36, 88)

In [44]:
golds_frequent = []
TI_labs_frequent = [[] for _ in range(len(runs))]
M2C_labs_frequent = [[] for _ in range(len(runs))]
golds_common = []
TI_labs_common = [[] for _ in range(len(runs))]
M2C_labs_common = [[] for _ in range(len(runs))]
golds_rare = []
TI_labs_rare = [[] for _ in range(len(runs))]
M2C_labs_rare = [[] for _ in range(len(runs))]

for run in runs:
    for ti_preds, m2c_preds, ti_accs, m2c_accs in zip(TI_preds[run], M2C_preds[run], TI_accs[run], M2C_accs[run]):
        for i, (ti, m2c) in enumerate(zip(ti_preds, m2c_preds)):
            ti_word, gold, ti_pred, ti_order = ti
            m2c_word, _, m2c_pred = m2c
            if run == "Run1":
                if gold in frequent_tags:
                    golds_frequent.append(gold)
                elif gold in common_tags:
                    golds_common.append(gold)
                elif gold in rare_tags:
                    golds_rare.append(gold)
            if gold in frequent_tags:
                TI_labs_frequent[runs.index(run)].append(ti_pred)
                M2C_labs_frequent[runs.index(run)].append(m2c_pred)
            elif gold in common_tags:
                TI_labs_common[runs.index(run)].append(ti_pred)
                M2C_labs_common[runs.index(run)].append(m2c_pred)
            elif gold in rare_tags:
                TI_labs_rare[runs.index(run)].append(ti_pred)
                M2C_labs_rare[runs.index(run)].append(m2c_pred)
golds_frequent = np.array(golds_frequent)
for run in runs:
    TI_labs_frequent[runs.index(run)] = np.array(TI_labs_frequent[runs.index(run)])
    M2C_labs_frequent[runs.index(run)] = np.array(M2C_labs_frequent[runs.index(run)])
golds_common = np.array(golds_common)
for run in runs:
    TI_labs_common[runs.index(run)] = np.array(TI_labs_common[runs.index(run)])
    M2C_labs_common[runs.index(run)] = np.array(M2C_labs_common[runs.index(run)])
golds_rare = np.array(golds_rare)
for run in runs:
    TI_labs_rare[runs.index(run)] = np.array(TI_labs_rare[runs.index(run)])
    M2C_labs_rare[runs.index(run)] = np.array(M2C_labs_rare[runs.index(run)])
    
            

In [45]:
for i in range(len(runs)):
    for freq in ["Frequent", "Common", "Rare"]:
        if freq == "Frequent":
            observed_diff, p_value = permutation_test(golds_frequent, TI_labs_frequent[i], M2C_labs_frequent[i], n_iterations=1000)
        elif freq == "Common":
            observed_diff, p_value = permutation_test(golds_common, TI_labs_common[i], M2C_labs_common[i], n_iterations=1000)
        elif freq == "Rare":
            observed_diff, p_value = permutation_test(golds_rare, TI_labs_rare[i], M2C_labs_rare[i], n_iterations=1000)

        # Print results
        print(f"Observed difference in accuracy for {freq} tags in {runs[i]}: {observed_diff:.4f}")
        print(f"P-value: {p_value:.4f}")

Observed difference in accuracy for Frequent tags in Run1: -0.0073
P-value: 0.0000
Observed difference in accuracy for Common tags in Run1: -0.0130
P-value: 0.1270
Observed difference in accuracy for Rare tags in Run1: 0.3333
P-value: 0.0000
Observed difference in accuracy for Frequent tags in Run2: -0.0233
P-value: 0.0000
Observed difference in accuracy for Common tags in Run2: 0.0000
P-value: 1.0000
Observed difference in accuracy for Rare tags in Run2: 0.3590
P-value: 0.0000
Observed difference in accuracy for Frequent tags in Run3: -0.0122
P-value: 0.0000
Observed difference in accuracy for Common tags in Run3: -0.0065
P-value: 0.7110
Observed difference in accuracy for Rare tags in Run3: 0.3077
P-value: 0.0000


In [27]:
for i, lab in enumerate(golds_rare):
    if lab == TI_labs_rare[0][i]:
        print(f"Correct prediction for rare tag and TI: {lab}")
    elif lab == M2C_labs_rare[0][i]:
        print(f"Correct prediction for rare tag and M2C: {lab}")

Correct prediction for rare tag and M2C: (S[dcl]\S[wq])\NP
Correct prediction for rare tag and M2C: S[as]/S[poss]
Correct prediction for rare tag and M2C: (S[dcl]\NP[thr])/(S[to]\NP)
Correct prediction for rare tag and M2C: ((S[dcl]\S[dcl])\NP)/PP
Correct prediction for rare tag and M2C: (S[dcl]\NP[thr])/(S[to]\NP)
Correct prediction for rare tag and M2C: (S[dcl]\S[qem])/PP
