In [1]:
import json
import numpy as np
from sklearn import metrics


In [95]:
np.random.seed(1234)

# adapted from: https://github.com/RobertInjac/Master-thesis/
def bootstrap_significance_testing(y_true, y_predA, y_predB, metric, n=int(1e5)):
    """
    Perform bootstrap significance testing.
    
    Null hypothesis is: A is no better than B on the population as a whole.
    Alternative hypothesis: A is better than B on the population as a whole.
    
    The return value is the p-value for this test.
    The bootstrap estimates the p-value though a combination of simulation and approximation.
    
    A small p-value indicates strong evidence against the null hypothesis.
    In other words, it gives evidence that A is better than B.
    
    Explaination in detail (section 2.2. Boostrap):
    Berg-Kirkpatrick, Taylor, David Burkett, and Dan Klein. "An empirical investigation of statistical significance in nlp."
    Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning.
    Association for Computational Linguistics, 2012.
    
    :param y_true: 
    :param y_predA: predictions of model A
    :param y_predB: predictions of model B
    :param metric: used metric, has to be a function of form f(y_true, y_pred)
    :param n: integer; the number of times to perform bootstrap resampling
    """
    v1 = metric(y_true, y_predA)
    v2 = metric(y_true, y_predB)
    d = (v1 - v2) * 2 # A has to beat B by at least 2* delta(x)
    print('delta', d)
    s = 0 # number of times A beats B
    print('diff', d)
    print('macro A', v1)
    print('macro B', v2)
    l = len(y_true)
    for i in range(n):
        idx = np.random.choice(l, l, replace=True) # sample indices with replacement
        v1i = metric(y_true[idx], y_predA[idx])
        v2i = metric(y_true[idx], y_predB[idx])
        #print('f1 A', v1i)
        #print('f1 B', v2i)
        di = v1i - v2i
        if di > d:
            s += 1
    print('s',s)
    print('n',n)
    return s / n
    

In [96]:
def get_golds_preds(file_name, is_fine=True):
    file_suffix = "_test"
    file_suffix_fine = file_suffix + "_fine"
    json_gold_pred = "label_set_info (id/gold/pred)"
    json_gold_pred_fine = "id_gold_pred_fine"


    n_folds = 4

    json_files = []
    for fold in range(n_folds):
        if is_fine:
            json_file = file_name + str(fold) + file_suffix_fine
        else:
            json_file = file_name + str(fold) + file_suffix
        json_files.append(json_file)

    if is_fine:
        json_gold_pred = json_gold_pred_fine

    id_golds_preds = []    
    for json_file in json_files:
        with open(json_file, 'r') as json_data:
            id_golds_preds.extend(json.load(json_data)[json_gold_pred])

    golds = []
    preds = []
    for _, gold, pred in id_golds_preds:
        golds.append(gold)
        preds.append(pred)

    return np.array(golds), np.array(preds)

In [97]:
metric = lambda predA, predB: metrics.f1_score(predA, predB, average='macro')

In [98]:
base_r_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_classification/1_input/metrics_roberta_classification_r_text_test.json_fold"

base_hr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_classification/2_input/metrics_roberta_classification_hit_order_r_text_test.json_fold"

base_rc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_classification/2_input/metrics_roberta_classification_r_text_gold_sentiments_coarse_num_test.json_fold"
base_cr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_classification/2_input/metrics_roberta_classification_gold_sentiments_coarse_num_r_text_test.json_fold"

base_r_golds, base_r_preds = get_golds_preds(base_r_file, is_fine=False)

base_hr_golds, base_hr_preds = get_golds_preds(base_hr_file, is_fine=False)

base_rc_golds, base_rc_preds = get_golds_preds(base_rc_file, is_fine=False)
base_cr_golds, base_cr_preds = get_golds_preds(base_cr_file, is_fine=False)

majority_preds = np.array([[1]*len(base_r_golds[0])]*len(base_r_golds))

### Sanity check w/ Majority

In [19]:
p_value = bootstrap_significance_testing(base_r_golds, base_r_preds, majority_preds, metric, n=int(1e4))
print("Majority vs. Base: Estimated p-value: ", p_value)

delta 0.3449333385880894
diff 0.3449333385880894
macro A 0.561065043749145
macro B 0.3885983744551003
s 0
n 10000
Majority vs. Base: Estimated p-value:  0.0


### 2-inputs w/ Coarse-grained sentiment

In [20]:
p_value = bootstrap_significance_testing(base_r_golds, base_rc_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RC: Estimated p-value: " + str(p_value))

delta 0.1001994301431477
diff 0.1001994301431477
macro A 0.6111647588207189
macro B 0.561065043749145
s 4
n 10000
Base vs. +RC: Estimated p-value: 0.0004


In [21]:
p_value = bootstrap_significance_testing(base_r_golds, base_cr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +CR: Estimated p-value: " + str(p_value))

delta 0.106811666705668
diff 0.106811666705668
macro A 0.614470877101979
macro B 0.561065043749145
s 4
n 10000
Base vs. +CR: Estimated p-value: 0.0004


### 2-inputs w/ Hit order

In [22]:
p_value = bootstrap_significance_testing(base_r_golds, base_hr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +HR: Estimated p-value: " + str(p_value))

delta 0.028323346557466378
diff 0.028323346557466378
macro A 0.5752267170278782
macro B 0.561065043749145
s 1302
n 10000
Base vs. +HR: Estimated p-value: 0.1302


## Hierarchical

In [109]:
hier_r_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/1_input/metrics_roberta_hierarchical_r_text_test.json_fold"

hier_r_old_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/1_input/metrics_roberta_hierarchical_r_text_test.json_fold"
hier_cr_old_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_r_text_test.json_fold"
hier_rc_old_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_test.json_fold"
hier_tr_old_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_q_text_last_question_r_text_test.json_fold"
hier_rct_old_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_q_text_last_question_test.json_fold"

hier_rs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_gold_sentiments_num_test.json_fold"
hier_sr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_gold_sentiments_num_r_text_test.json_fold"
hier_rc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_test.json_fold"
hier_cr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_r_text_test.json_fold"

hier_tr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_last_question_r_text_test.json_fold"
hier_rt_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_last_question_r_text_test.json_fold"


hier_fr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_first_question_and_rest_r_text_test.json_fold"
hier_rf_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_text_first_question_and_rest_test.json_fold"

hier_er_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_last_question_and_rest_r_text_test.json_fold"
hier_re_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_text_last_question_and_rest_test.json_fold"

hier_ar_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_all_questions_r_text_test.json_fold"
hier_ra_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_text_all_questions_test.json_fold"

hier_2r_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_last_2_sents_r_text_test.json_fold"
hier_r2_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_text_last_2_sents_test.json_fold"

hier_3r_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_text_last_3_sents_r_text_test.json_fold"
hier_r3_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_text_last_3_sents_test.json_fold"

hier_hr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_hit_order_r_text_test.json_fold"


hier_rk_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_speaker_test.json_fold"
hier_kr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_q_speaker_r_text_test.json_fold"

hier_rp_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_speaker_party_test.json_fold"

hier_rl_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_q_speaker_role_test.json_fold"

hier_qr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_gold_q_sentiments_coarse_num_r_text_test.json_fold"
hier_rq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_gold_q_sentiments_coarse_num_test.json_fold"

hier_ri_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_r_text_gold_q_intents_num_test.json_fold"
hier_ir_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/2_input/metrics_roberta_hierarchical_gold_q_intents_num_r_text_test.json_fold"

hier_rcf_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_q_text_first_question_and_rest_test.json_fold"
hier_frc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_q_text_first_question_and_rest_r_text_gold_sentiments_coarse_num_test.json_fold"

hier_crq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_r_text_gold_q_sentiments_coarse_num_test.json_fold"
hier_cqr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_gold_q_sentiments_coarse_num_r_text_test.json_fold"
hier_rcq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_gold_q_sentiments_coarse_num_test.json_fold"
hier_rqc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_r_text_gold_q_sentiments_coarse_num_gold_sentiments_coarse_num_test.json_fold"
hier_qcr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_gold_q_sentiments_coarse_num_gold_sentiments_coarse_num_r_text_test.json_fold"
hier_qrc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_gold_q_sentiments_coarse_num_r_text_gold_sentiments_coarse_num_test.json_fold"


hier_chr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_hit_order_r_text_test.json_fold"
hier_crh_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_r_text_hit_order_test.json_fold"
hier_rch_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/3_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_hit_order_test.json_fold"

hier_rqhc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results_final/roberta_hierarchical/4_input/metrics_roberta_hierarchical_r_text_gold_q_sentiments_coarse_num_hit_order_gold_sentiments_coarse_num_test.json_fold"


hier_r_golds, hier_r_preds = get_golds_preds(hier_r_file, is_fine=True)

hier_rc_golds, hier_rc_preds = get_golds_preds(hier_rc_file, is_fine=True)
hier_cr_golds, hier_cr_preds = get_golds_preds(hier_cr_file, is_fine=True)

hier_rs_golds, hier_rs_preds = get_golds_preds(hier_rs_file, is_fine=True)
hier_sr_golds, hier_sr_preds = get_golds_preds(hier_sr_file, is_fine=True)

hier_tr_golds, hier_tr_preds = get_golds_preds(hier_tr_file, is_fine=True)
hier_rt_golds, hier_rt_preds = get_golds_preds(hier_tr_file, is_fine=True)

hier_fr_golds, hier_fr_preds = get_golds_preds(hier_fr_file, is_fine=True)
hier_rf_golds, hier_rf_preds = get_golds_preds(hier_rf_file, is_fine=True)

hier_er_golds, hier_er_preds = get_golds_preds(hier_er_file, is_fine=True)
hier_re_golds, hier_re_preds = get_golds_preds(hier_re_file, is_fine=True)

hier_ar_golds, hier_ar_preds = get_golds_preds(hier_ar_file, is_fine=True)
hier_ra_golds, hier_ra_preds = get_golds_preds(hier_ra_file, is_fine=True)

hier_2r_golds, hier_2r_preds = get_golds_preds(hier_2r_file, is_fine=True)
hier_r2_golds, hier_r2_preds = get_golds_preds(hier_r2_file, is_fine=True)

hier_3r_golds, hier_3r_preds = get_golds_preds(hier_3r_file, is_fine=True)
hier_r3_golds, hier_r3_preds = get_golds_preds(hier_r3_file, is_fine=True)

hier_hr_golds, hier_hr_preds = get_golds_preds(hier_hr_file, is_fine=True)

hier_fr_golds, hier_fr_preds = get_golds_preds(hier_fr_file, is_fine=True)

hier_rk_golds, hier_rk_preds = get_golds_preds(hier_rk_file, is_fine=True)
hier_kr_golds, hier_kr_preds = get_golds_preds(hier_kr_file, is_fine=True)
hier_rp_golds, hier_rp_preds = get_golds_preds(hier_rp_file, is_fine=True)
hier_rl_golds, hier_rl_preds = get_golds_preds(hier_rl_file, is_fine=True)

hier_ir_golds, hier_ir_preds = get_golds_preds(hier_ir_file, is_fine=True)
hier_ri_golds, hier_ri_preds = get_golds_preds(hier_ri_file, is_fine=True)

hier_qr_golds, hier_qr_preds = get_golds_preds(hier_qr_file, is_fine=True)
hier_rq_golds, hier_rq_preds = get_golds_preds(hier_rq_file, is_fine=True)

hier_rcf_golds, hier_rcf_preds = get_golds_preds(hier_rcf_file, is_fine=True)
hier_frc_golds, hier_frc_preds = get_golds_preds(hier_frc_file, is_fine=True)

hier_crq_golds, hier_crq_preds = get_golds_preds(hier_crq_file, is_fine=True)
hier_cqr_golds, hier_cqr_preds = get_golds_preds(hier_cqr_file, is_fine=True)
hier_rcq_golds, hier_rcq_preds = get_golds_preds(hier_rcq_file, is_fine=True)
hier_rqc_golds, hier_rqc_preds = get_golds_preds(hier_rqc_file, is_fine=True)
hier_qcr_golds, hier_qcr_preds = get_golds_preds(hier_qcr_file, is_fine=True)
hier_qrc_golds, hier_qrc_preds = get_golds_preds(hier_qrc_file, is_fine=True)


hier_chr_golds, hier_chr_preds = get_golds_preds(hier_chr_file, is_fine=True)
hier_crh_golds, hier_crh_preds = get_golds_preds(hier_crh_file, is_fine=True)
hier_rch_golds, hier_rch_preds = get_golds_preds(hier_rch_file, is_fine=True)

hier_rqhc_golds, hier_rqhc_preds = get_golds_preds(hier_rqhc_file, is_fine=True)

hier_r_old_golds, hier_r_old_preds = get_golds_preds(hier_r_old_file, is_fine=True)
hier_cr_old_golds, hier_cr_old_preds = get_golds_preds(hier_cr_old_file, is_fine=True)
hier_rc_old_golds, hier_rc_old_preds = get_golds_preds(hier_rc_old_file, is_fine=True)
hier_tr_old_golds, hier_tr_old_preds = get_golds_preds(hier_tr_old_file, is_fine=True)
hier_rct_old_golds, hier_rct_old_preds = get_golds_preds(hier_rct_old_file, is_fine=True)

In [81]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_r_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. Hier: Estimated p-value: " + str(p_value))

delta 0.020039894669472202
diff 0.020039894669472202
macro A 0.5710849910838811
macro B 0.561065043749145
s 2470
n 10000
Base vs. Hier: Estimated p-value: 0.247


### Old model

In [101]:
p_value = bootstrap_significance_testing(hier_r_old_golds, hier_cr_old_preds, hier_r_old_preds, metric, n=int(1e4))
print("Hier vs. +CR: Estimated p-value: ", p_value)

p_value = bootstrap_significance_testing(hier_r_old_golds, hier_tr_old_preds, hier_r_old_preds, metric, n=int(1e4))
print("Hier vs. +LastQR: Estimated p-value: ", p_value)

p_value = bootstrap_significance_testing(hier_r_old_golds, hier_rct_old_preds, hier_r_old_preds, metric, n=int(1e4))
print("Hier vs. +RCLastQ: Estimated p-value: ", p_value)

delta 0.046093668920515585
diff 0.046093668920515585
macro A 0.5927406997759949
macro B 0.5696938653157371
s 620
n 10000
Hier vs. +CR: Estimated p-value:  0.062
delta 0.03726209233119726
diff 0.03726209233119726
macro A 0.5883249114813357
macro B 0.5696938653157371
s 1148
n 10000
Hier vs. +LastQR: Estimated p-value:  0.1148
delta 0.06825419368626351
diff 0.06825419368626351
macro A 0.6038209621588688
macro B 0.5696938653157371
s 162
n 10000
Hier vs. +RCLastQ: Estimated p-value:  0.0162


In [103]:
p_value = bootstrap_significance_testing(hier_r_old_golds, hier_rct_old_preds, hier_cr_old_preds, metric, n=int(1e4))
print("+CR vs. +RCLastQ: Estimated p-value: ", p_value)

delta 0.022160524765747924
diff 0.022160524765747924
macro A 0.6038209621588688
macro B 0.5927406997759949
s 2660
n 10000
+CR vs. +RCLastQ: Estimated p-value:  0.266


In [108]:
p_value = bootstrap_significance_testing(hier_r_old_golds, hier_rct_old_preds, hier_tr_old_preds, metric, n=int(1e4))
print("+LastQR vs. +RCLastQ: Estimated p-value: ", p_value)

delta 0.03099210135506625
diff 0.03099210135506625
macro A 0.6038209621588688
macro B 0.5883249114813357
s 2041
n 10000
+LastQR vs. +RCLastQ: Estimated p-value:  0.2041


In [None]:
p_value = bootstrap_significance_testing(hier_r_old_golds, hier_rct_old_preds, hier_cr_old_preds, metric, n=int(1e4))
print("+CR vs. +RCLastQ: Estimated p-value: ", p_value)

### 2-inputs w/ Hit Order

In [26]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_hr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +HR: Estimated p-value: ", p_value)

delta 0.02230272833368141
diff 0.02230272833368141
macro A 0.5822363552507218
macro B 0.5710849910838811
s 2059
n 10000
Hier vs. +HR: Estimated p-value:  0.2059


### 2-inputs w/ Question

In [82]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_fr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +FirstqandrestR: Estimated p-value: " + str(p_value))
p_value = bootstrap_significance_testing(hier_r_golds, hier_rf_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RFirstqandrest: Estimated p-value: " + str(p_value))

delta -0.003333713072726452
diff -0.003333713072726452
macro A 0.5694181345475179
macro B 0.5710849910838811
s 5411
n 10000
Hier vs. +FirstqandrestR: Estimated p-value: 0.5411
delta -0.04994534022174246
diff -0.04994534022174246
macro A 0.5461123209730099
macro B 0.5710849910838811
s 9461
n 10000
Hier vs. +RFirstqandrest: Estimated p-value: 0.9461


In [83]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_er_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +LastqandrestR: Estimated p-value: " + str(p_value))
p_value = bootstrap_significance_testing(hier_r_golds, hier_re_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RLastqandrest: Estimated p-value: " + str(p_value))

delta -0.030690487931488875
diff -0.030690487931488875
macro A 0.5557397471181367
macro B 0.5710849910838811
s 8255
n 10000
Hier vs. +LastqandrestR: Estimated p-value: 0.8255
delta -0.04995546190815081
diff -0.04995546190815081
macro A 0.5461072601298057
macro B 0.5710849910838811
s 9335
n 10000
Hier vs. +RLastqandrest: Estimated p-value: 0.9335


In [84]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_tr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +LastqR: Estimated p-value: " + str(p_value))
p_value = bootstrap_significance_testing(hier_r_golds, hier_rt_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RLastq: Estimated p-value: " + str(p_value))

delta -0.024628840775199334
diff -0.024628840775199334
macro A 0.5587705706962814
macro B 0.5710849910838811
s 7751
n 10000
Hier vs. +LastqR: Estimated p-value: 0.7751
delta -0.024628840775199334
diff -0.024628840775199334
macro A 0.5587705706962814
macro B 0.5710849910838811
s 7862
n 10000
Hier vs. +RLastq: Estimated p-value: 0.7862


In [85]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_ar_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +AllqR: Estimated p-value: " + str(p_value))
p_value = bootstrap_significance_testing(hier_r_golds, hier_ra_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RAllq: Estimated p-value: " + str(p_value))

delta -0.019257619660831837
diff -0.019257619660831837
macro A 0.5614561812534652
macro B 0.5710849910838811
s 7435
n 10000
Hier vs. +AllqR: Estimated p-value: 0.7435
delta -0.0446299258196432
diff -0.0446299258196432
macro A 0.5487700281740595
macro B 0.5710849910838811
s 9367
n 10000
Hier vs. +RAllq: Estimated p-value: 0.9367


In [86]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_2r_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +Last2R: Estimated p-value: " + str(p_value))
p_value = bootstrap_significance_testing(hier_r_golds, hier_r2_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RLast2: Estimated p-value: " + str(p_value))

delta -0.005181227821587697
diff -0.005181227821587697
macro A 0.5684943771730873
macro B 0.5710849910838811
s 5638
n 10000
Hier vs. +Last2R: Estimated p-value: 0.5638
delta -0.07071547593573668
diff -0.07071547593573668
macro A 0.5357272531160128
macro B 0.5710849910838811
s 9880
n 10000
Hier vs. +RLast2: Estimated p-value: 0.988


In [87]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_3r_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +Last3R: Estimated p-value: " + str(p_value))
p_value = bootstrap_significance_testing(hier_r_golds, hier_r3_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RLas32: Estimated p-value: " + str(p_value))

delta -0.06387616525617634
diff -0.06387616525617634
macro A 0.5391469084557929
macro B 0.5710849910838811
s 9812
n 10000
Hier vs. +Last3R: Estimated p-value: 0.9812
delta -0.07809628732813456
diff -0.07809628732813456
macro A 0.5320368474198138
macro B 0.5710849910838811
s 9939
n 10000
Hier vs. +RLas32: Estimated p-value: 0.9939


### 2-inputs Q Speaker

In [31]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rk_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RK: Estimated p-value: " + str(p_value))

delta 0.0007263802692760724
diff 0.0007263802692760724
macro A 0.5714481812185191
macro B 0.5710849910838811
s 4885
n 10000
Hier vs. +RK: Estimated p-value: 0.4885


In [40]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_kr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +KR: Estimated p-value: " + str(p_value))

delta 0.002129818307006426
diff 0.002129818307006426
macro A 0.5721499002373843
macro B 0.5710849910838811
s 4782
n 10000
Hier vs. +KR: Estimated p-value: 0.4782


In [42]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rp_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RP: Estimated p-value: " + str(p_value))

delta 0.016899353827634078
diff 0.016899353827634078
macro A 0.5795346679976981
macro B 0.5710849910838811
s 2633
n 10000
Hier vs. +RP: Estimated p-value: 0.2633


In [32]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rl_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RL: Estimated p-value: " + str(p_value))

delta 0.013312284586448175
diff 0.013312284586448175
macro A 0.5777411333771052
macro B 0.5710849910838811
s 3039
n 10000
Hier vs. +RL: Estimated p-value: 0.3039


### 2-inputs w/ Questioner Coarse-grained sentiment

In [33]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QR: Estimated p-value: " + str(p_value))

delta 0.036548009922330094
diff 0.036548009922330094
macro A 0.5893589960450462
macro B 0.5710849910838811
s 797
n 10000
Hier vs. +QR: Estimated p-value: 0.0797


In [35]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RQ: Estimated p-value: " + str(p_value))

delta 0.04006283554468015
diff 0.04006283554468015
macro A 0.5911164088562212
macro B 0.5710849910838811
s 899
n 10000
Hier vs. +RQ: Estimated p-value: 0.0899


### 2-inputs w/ Questioner Intents

In [37]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_ri_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RI: Estimated p-value: " + str(p_value))

delta -0.004490778830823716
diff -0.004490778830823716
macro A 0.5688396016684693
macro B 0.5710849910838811
s 5563
n 10000
Hier vs. +RI: Estimated p-value: 0.5563


In [38]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_ir_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +IR: Estimated p-value: " + str(p_value))

delta 0.008550738544601755
diff 0.008550738544601755
macro A 0.575360360356182
macro B 0.5710849910838811
s 3885
n 10000
Hier vs. +IR: Estimated p-value: 0.3885


In [75]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_tr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +Last QuestionR: Estimated p-value: " + str(p_value))

delta -0.024628840775199334
diff -0.024628840775199334
macro A 0.5587705706962814
macro B 0.5710849910838811
s 7771
n 10000
Hier vs. +Last QuestionR: Estimated p-value: 0.7771


### 2-inputs w/ Fine-grained sentiment

In [110]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rs_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RS: Estimated p-value: " + str(p_value))

p_value = bootstrap_significance_testing(hier_r_golds, hier_sr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +SR: Estimated p-value: " + str(p_value))

delta 0.10053229397760588
diff 0.10053229397760588
macro A 0.621351138072684
macro B 0.5710849910838811
s 4
n 10000
Hier vs. +RS: Estimated p-value: 0.0004
delta 0.1168268072765466
diff 0.1168268072765466
macro A 0.6294983947221544
macro B 0.5710849910838811
s 0
n 10000
Hier vs. +SR: Estimated p-value: 0.0


In [112]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_sr_preds, hier_rc_preds, metric, n=int(1e4))
print("+RC vs. +SR: Estimated p-value: " + str(p_value))

delta 0.027868335565156555
diff 0.027868335565156555
macro A 0.6294983947221544
macro B 0.6155642269395761
s 1463
n 10000
+RC vs. +SR: Estimated p-value: 0.1463


### 2-inputs w/ Coarse-grained sentiment

In [43]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RC: Estimated p-value: " + str(p_value))

delta 0.08895847171139004
diff 0.08895847171139004
macro A 0.6155642269395761
macro B 0.5710849910838811
s 13
n 10000
Hier vs. +RC: Estimated p-value: 0.0013


In [44]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_cr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CR: Estimated p-value: " + str(p_value))

delta 0.06311529165530261
diff 0.06311529165530261
macro A 0.6026426369115324
macro B 0.5710849910838811
s 192
n 10000
Hier vs. +CR: Estimated p-value: 0.0192


### 3-inputs w/ Coarse-grained sentiment

In [45]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rcf_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RCF: Estimated p-value: " + str(p_value))

delta 0.0792131336521602
diff 0.0792131336521602
macro A 0.6106915579099612
macro B 0.5710849910838811
s 87
n 10000
Hier vs. +RCF: Estimated p-value: 0.0087


In [46]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rcf_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +RCF: Estimated p-value: " + str(p_value))

delta -0.009745338059229836
diff -0.009745338059229836
macro A 0.6106915579099612
macro B 0.6155642269395761
s 6382
n 10000
RC vs. +RCF: Estimated p-value: 0.6382


In [48]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_frc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +FRC: Estimated p-value: " + str(p_value))

delta 0.07765203907501839
diff 0.07765203907501839
macro A 0.6099110106213903
macro B 0.5710849910838811
s 135
n 10000
Hier vs. +FRC: Estimated p-value: 0.0135


In [49]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_frc_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +FRC: Estimated p-value: " + str(p_value))

delta -0.011306432636371655
diff -0.011306432636371655
macro A 0.6099110106213903
macro B 0.6155642269395761
s 6492
n 10000
RC vs. +FRC: Estimated p-value: 0.6492


In [50]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_crq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CRQ: Estimated p-value: " + str(p_value))

delta 0.08522208920140639
diff 0.08522208920140639
macro A 0.6136960356845843
macro B 0.5710849910838811
s 52
n 10000
Hier vs. +CRQ: Estimated p-value: 0.0052


In [51]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_crq_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +CRQ: Estimated p-value: " + str(p_value))

delta -0.003736382509983649
diff -0.003736382509983649
macro A 0.6136960356845843
macro B 0.6155642269395761
s 5426
n 10000
RC vs. +FRC: Estimated p-value: 0.5426


In [66]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_cqr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CQR: Estimated p-value: " + str(p_value))

delta 0.05760335344306422
diff 0.05760335344306422
macro A 0.5998866678054132
macro B 0.5710849910838811
s 238
n 10000
Hier vs. +CQR: Estimated p-value: 0.0238


In [67]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_cqr_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +CQR: Estimated p-value: " + str(p_value))

delta -0.03135511826832582
diff -0.03135511826832582
macro A 0.5998866678054132
macro B 0.6155642269395761
s 8892
n 10000
RC vs. +CQR: Estimated p-value: 0.8892


In [52]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rqc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RQC: Estimated p-value: " + str(p_value))

delta 0.09206676138242442
diff 0.09206676138242442
macro A 0.6171183717750933
macro B 0.5710849910838811
s 21
n 10000
Hier vs. +RQC: Estimated p-value: 0.0021


In [53]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rqc_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +RQC: Estimated p-value: " + str(p_value))

delta 0.003108289671034381
diff 0.003108289671034381
macro A 0.6171183717750933
macro B 0.6155642269395761
s 4550
n 10000
RC vs. +RQC: Estimated p-value: 0.455


In [68]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rcq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RCQ: Estimated p-value: " + str(p_value))

delta 0.06811597609524056
diff 0.06811597609524056
macro A 0.6051429791315014
macro B 0.5710849910838811
s 63
n 10000
Hier vs. +RCQ: Estimated p-value: 0.0063


In [69]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rcq_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +RCQ: Estimated p-value: " + str(p_value))

delta -0.02084249561614948
diff -0.02084249561614948
macro A 0.6051429791315014
macro B 0.6155642269395761
s 7793
n 10000
RC vs. +RCQ: Estimated p-value: 0.7793


In [70]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qrc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QRC: Estimated p-value: " + str(p_value))

delta 0.06266392130885579
diff 0.06266392130885579
macro A 0.602416951738309
macro B 0.5710849910838811
s 208
n 10000
Hier vs. +QRC: Estimated p-value: 0.0208


In [71]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qrc_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +QRC: Estimated p-value: " + str(p_value))

delta -0.026294550402534256
diff -0.026294550402534256
macro A 0.602416951738309
macro B 0.6155642269395761
s 8381
n 10000
RC vs. +QRC: Estimated p-value: 0.8381


In [72]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qcr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QCR: Estimated p-value: " + str(p_value))

delta 0.07433689892193174
diff 0.07433689892193174
macro A 0.608253440544847
macro B 0.5710849910838811
s 77
n 10000
Hier vs. +QCR: Estimated p-value: 0.0077


In [73]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qcr_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +QCR: Estimated p-value: " + str(p_value))

delta -0.014621572789458304
diff -0.014621572789458304
macro A 0.608253440544847
macro B 0.6155642269395761
s 7073
n 10000
RC vs. +QCR: Estimated p-value: 0.7073


In [54]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_crh_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CRH: Estimated p-value: " + str(p_value))

delta 0.09593505096779587
diff 0.09593505096779587
macro A 0.619052516567779
macro B 0.5710849910838811
s 8
n 10000
Hier vs. +CRH: Estimated p-value: 0.0008


In [55]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_crh_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +CRH: Estimated p-value: " + str(p_value))

delta 0.006976579256405824
diff 0.006976579256405824
macro A 0.619052516567779
macro B 0.6155642269395761
s 3915
n 10000
RC vs. +CRH: Estimated p-value: 0.3915


In [56]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rch_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RCH: Estimated p-value: " + str(p_value))

delta 0.09297433382624232
diff 0.09297433382624232
macro A 0.6175721579970023
macro B 0.5710849910838811
s 8
n 10000
Hier vs. +RCH: Estimated p-value: 0.0008


In [57]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rch_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +RCH: Estimated p-value: " + str(p_value))

delta 0.0040158621148522755
diff 0.0040158621148522755
macro A 0.6175721579970023
macro B 0.6155642269395761
s 4362
n 10000
RC vs. +RCH: Estimated p-value: 0.4362


In [59]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rqhc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RQHC: Estimated p-value: " + str(p_value))

delta 0.08853341867088305
diff 0.08853341867088305
macro A 0.6153517004193226
macro B 0.5710849910838811
s 25
n 10000
Hier vs. +RQHC: Estimated p-value: 0.0025


In [60]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rqhc_preds, hier_rc_preds, metric, n=int(1e4))
print("RC vs. +RQHC: Estimated p-value: " + str(p_value))

delta -0.0004250530405069952
diff -0.0004250530405069952
macro A 0.6153517004193226
macro B 0.6155642269395761
s 5116
n 10000
RC vs. +RQHC: Estimated p-value: 0.5116
