In [33]:
import numpy as np
from scipy.stats import ttest_ind

def calculate_p_stat(score_name, set1_scores, set2_scores):
    """Run a two-sample t-test on two score sets and print the result.

    Args:
        score_name: Label inserted into the printed header line.
        set1_scores: First sequence of scores.
        set2_scores: Second sequence of scores; must contain as many entries
            as ``set1_scores``.

    Returns:
        The ``(t_stat, p_value)`` pair unpacked from ``ttest_ind``, so the
        caller can reuse the numbers instead of parsing printed text.

    Raises:
        ValueError: If the two score sets differ in length.
    """
    if len(set1_scores) != len(set2_scores):
        # Raise instead of calling exit(): exit() ends the whole interpreter /
        # kernel session, which is far too drastic for a bad argument and
        # cannot be handled by the caller.
        raise ValueError("Error: different number of scores in the two sets")

    # NOTE(review): ttest_ind treats the two sets as independent samples. The
    # equal-length requirement above suggests the scores may be paired per
    # example; if so, scipy.stats.ttest_rel is probably the intended test --
    # confirm before relying on the p-value.
    t_stat, p_value = ttest_ind(set1_scores, set2_scores)

    print("Stats for {} Scores:".format(score_name))
    print("T-test statistic: ", t_stat)
    print("P-value: ", p_value)

    return t_stat, p_value

def percent_improvement(score_name, improved_scores, original_scores):
    """Print and return per-example and mean percentage change between two score sets.

    For every pair the change is ``(improved - original) / original * 100``.
    Pairs whose original score equals 0 are skipped because the relative
    change is undefined there, so the per-example list can be shorter than
    the inputs.

    Args:
        score_name: Label inserted into the printed header line.
        improved_scores: Scores being evaluated, one per example.
        original_scores: Baseline scores, in the same example order.

    Returns:
        A ``(per_example, aggregate)`` tuple: the list of per-example
        percentage changes and their arithmetic mean. ``aggregate`` is
        ``nan`` when no pair has a non-zero original score.

    Raises:
        ValueError: If the two score sets differ in length.
    """
    # Materialise the inputs so the length can be checked even for iterators;
    # zip() alone would silently drop the unmatched tail.
    improved_list = list(improved_scores)
    original_list = list(original_scores)
    if len(improved_list) != len(original_list):
        raise ValueError("Error: different number of scores in the two sets")

    per_example = [
        ((improved - original) / original) * 100
        for original, improved in zip(original_list, improved_list)
        if original != 0
    ]

    # With empty input or an all-zero baseline there is nothing to average;
    # report nan rather than failing with ZeroDivisionError.
    if per_example:
        aggregate = sum(per_example) / len(per_example)
    else:
        aggregate = float("nan")

    print("Stats for {} Scores:".format(score_name))
    print("Percentage improvement for each example:", per_example)
    print("Aggregate percentage improvement:", aggregate)

    return per_example, aggregate


In [35]:
# Scores for every similarity metric, keyed by the label printed in the report
# header. Each value is a (set1_scores, set2_scores) pair; below, set1_scores
# is passed to percent_improvement as the "improved" scores and set2_scores
# as the "original" baseline. Keeping all metrics in one table means earlier
# metrics stay available after the cell runs instead of being overwritten.
METRIC_SCORES = {
    # Jaccard Similarity Scores
    "Jaccard Similarity": (
        np.array([0.143, 0.101, 0.0802, 0.08, 0.08, 0.0738, 0.072, 0.069, 0.064, 0.0432, 0.007]),
        np.array([0.1, 0.09, 0.054, 0.086, 0.057, 0.1, 0.056, 0.056, 0.045, 0.0482, 0.035]),
    ),
    # BLEU Similarity Scores
    "BLEU": (
        np.array([0.0738, 0.02889605415, 0.006961570131, 0.004569788283, 0.004489054433, 0.003934213392, 0.003120157128, 0.002032545482, 0.001551112365, 0.0008929035655, 0]),
        np.array([0, 0.02967740419, 0.006961570131, 0.003992077756, 0.004489054433, 0.003934213392, 0.003120157128, 0.002032545482, 0.001551112365, 0.0008929035655, 0.035]),
    ),
    # ROUGE-1 Similarity Scores
    "ROUGE-1": (
        np.array([0.3853210959, 0.2321428521, 0.2314049537, 0.1935483823, 0.1818181768, 0.1797752762, 0.1764705833, 0.1739130398, 0.1676646662, 0.1442307653, 0.07999999609]),
        np.array([0.2809917306, 0.2096774144, 0.1954887168, 0.1904761859, 0.1803278639, 0.1584158371, 0.1929824513, 0.1652892522, 0.1899441294, 0.1454545412, 0.09195401945]),
    ),
    # ROUGE-2 Similarity Scores
    "ROUGE-2": (
        np.array([0.1690140795, 0.04878048308, 0.04545454055, 0.03804347498, 0.02840908752, 0.02531645134, 0.02484471551, 0.01739129987, 0.01481480989, 0, 0]),
        np.array([0.06024095907, 0.06802720653, 0.039999995, 0.01020407783, 0.01063829397, 0.02298850102, 0.04324323828, 0.01438848517, 0.01257861168, 0.02380951899, 0]),
    ),
    # ROUGE-L Similarity Scores (based on LCS)
    "ROUGE-L": (
        np.array([0.3669724721, 0.1935483823, 0.1818181769, 0.1739130398, 0.1346153807, 0.1676646662, 0.1964285664, 0.1573033661, 0.137254897, 0.1454545405, 0.07999999609]),
        np.array([0.1983471025, 0.1904761859, 0.1804511228, 0.1652892522, 0.09999999576, 0.1564245763, 0.1612903176, 0.1584158371, 0.1403508723, 0.1803278639, 0.09195401945]),
    ),
    # Sentence Similarity Scores
    "mpnet scores": (
        np.array([0.8065617681, 0.8659570217, 0.6604626775, 0.796494782, 0.8181899786, 0.7874866724, 0.721357584, 0.8470582366, 0.7084579468, 0.8093848228, 0.6984382868]),
        np.array([0.7054891586, 0.8086572886, 0.7588876486, 0.8532322049, 0.940956831, 0.5509544611, 0.9234000444, 0.760665834, 0.6288704872, 0.649985671, 0.6206271052]),
    ),
}

# Report each metric in the order listed above, followed by a separator line.
# The loop variables are deliberately named set1_scores / set2_scores so that,
# as before, they are left bound to the last metric's arrays once the cell
# finishes. To also run the significance test, call
# calculate_p_stat(score_name, set1_scores, set2_scores) inside the loop.
for score_name, (set1_scores, set2_scores) in METRIC_SCORES.items():
    percent_improvement(score_name, set1_scores, set2_scores)
    print("----------------------------------")

Stats for Jaccard Similarity Scores:
Percentage improvement for each example: [42.999999999999986, 12.222222222222232, 48.51851851851851, -6.976744186046502, 40.35087719298245, -26.200000000000003, 28.57142857142856, 23.214285714285722, 42.222222222222236, -10.373443983402485, -80.0]
Aggregate percentage improvement: 10.413578752019156
----------------------------------
Stats for BLEU Scores:
Percentage improvement for each example: [-2.632811262729242, 0.0, 14.471424714403797, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -100.0]
Aggregate percentage improvement: -8.816138654832546
----------------------------------
Stats for ROUGE-1 Scores:
Percentage improvement for each example: [37.12898065620157, 10.7142859254945, 18.372537038413878, 1.6129031487499859, 0.8264462672426812, 13.483146313532302, -8.556149996421974, 5.217391624208713, -11.729482385360834, -0.8413459558593702, -13.000000904256282]
Aggregate percentage improvement: 4.8389737938131985
----------------------------------
Stats for ROUGE-