In [1]:
import json
import numpy as np
from sklearn import metrics


In [2]:
# adapted from: https://github.com/RobertInjac/Master-thesis/
def bootstrap_significance_testing(y_true, y_predA, y_predB, metric, n=int(1e5)):
    """
    Perform bootstrap significance testing.
    
    Null hypothesis is: A is no better than B on the population as a whole.
    Alternative hypothesis: A is better than B on the population as a whole.
    
    The return value is the p-value for this test.
    The bootstrap estimates the p-value though a combination of simulation and approximation.
    
    A small p-value indicates strong evidence against the null hypothesis.
    In other words, it gives evidence that A is better than B.
    
    Explaination in detail (section 2.2. Boostrap):
    Berg-Kirkpatrick, Taylor, David Burkett, and Dan Klein. "An empirical investigation of statistical significance in nlp."
    Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning.
    Association for Computational Linguistics, 2012.
    
    :param y_true: 
    :param y_predA: predictions of model A
    :param y_predB: predictions of model B
    :param metric: used metric, has to be a function of form f(y_true, y_pred)
    :param n: integer; the number of times to perform bootstrap resampling
    """
    v1 = metric(y_true, y_predA)
    v2 = metric(y_true, y_predB)
    d = (v1 - v2) * 2 # A has to beat B by at least 2* delta(x)
    s = 0 # number of times A beats B
    print('diff', d)
    print('macro A', v1)
    print('macro B', v2)
    l = len(y_true)
    for i in range(n):
        idx = np.random.choice(l, l, replace=True) # sample indices with replacement
        #print('iter', i)
        v1i = metric(y_true[idx], y_predA[idx])
        v2i = metric(y_true[idx], y_predB[idx])
        di = v1i - v2i
        if di > d:
            s += 1
    return s / n
    

In [3]:
def get_golds_preds(file_name, is_fine=True):
    file_suffix = "_test"
    file_suffix_fine = file_suffix + "_fine"
    json_gold_pred = "label_set_info (id/gold/pred)"
    json_gold_pred_fine = "id_gold_pred_fine"


    n_folds = 4

    json_files = []
    for fold in range(n_folds):
        if is_fine:
            json_file = file_name + str(fold) + file_suffix_fine
        else:
            json_file = file_name + str(fold) + file_suffix
        json_files.append(json_file)

    if is_fine:
        json_gold_pred = json_gold_pred_fine

    id_golds_preds = []    
    for json_file in json_files:
        with open(json_file, 'r') as json_data:
            id_golds_preds.extend(json.load(json_data)[json_gold_pred])

    golds = []
    preds = []
    for _, gold, pred in id_golds_preds:
        golds.append(gold)
        preds.append(pred)

    return np.array(golds), np.array(preds)

In [4]:
metric = lambda predA, predB: metrics.f1_score(predA, predB, average='macro')

In [6]:
base_r_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/1_input/metrics_roberta_classification_r_text_test.json_fold"

base_rq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/2_input/metrics_roberta_classification_r_text_q_text_last_question_test.json_fold"
base_qr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/2_input/metrics_roberta_classification_q_text_last_question_r_text_test.json_fold"

base_rs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/2_input/metrics_roberta_classification_r_text_gold_sentiments_num_test.json_fold"
base_sr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/2_input/metrics_roberta_classification_gold_sentiments_num_r_text_test.json_fold"
base_rc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/2_input/metrics_roberta_classification_r_text_gold_sentiments_coarse_num_test.json_fold"
base_cr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/2_input/metrics_roberta_classification_gold_sentiments_coarse_num_r_text_test.json_fold"

base_rsq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_r_text_gold_sentiments_num_q_text_last_question_test.json_fold"
base_rqs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_r_text_q_text_last_question_gold_sentiments_num_test.json_fold"
base_srq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_gold_sentiments_num_r_text_q_text_last_question_test.json_fold"
base_sqr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_gold_sentiments_num_q_text_last_question_r_text_test.json_fold"
base_qrs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_q_text_last_question_r_text_gold_sentiments_num_test.json_fold"
base_qsr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_q_text_last_question_gold_sentiments_num_r_text_test.json_fold"

base_rcq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_r_text_gold_sentiments_coarse_num_q_text_last_question_test.json_fold"
base_rqc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_r_text_q_text_last_question_gold_sentiments_coarse_num_test.json_fold"
base_crq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_gold_sentiments_coarse_num_r_text_q_text_last_question_test.json_fold"
base_cqr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_gold_sentiments_coarse_num_q_text_last_question_r_text_test.json_fold"
base_qrc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_q_text_last_question_r_text_gold_sentiments_coarse_num_test.json_fold"
base_qcr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_context/3_input/metrics_roberta_classification_q_text_last_question_gold_sentiments_coarse_num_r_text_test.json_fold"

base_r_golds, base_r_preds = get_golds_preds(base_r_file, is_fine=False)

base_rq_golds, base_rq_preds = get_golds_preds(base_rq_file, is_fine=False)
base_qr_golds, base_qr_preds = get_golds_preds(base_qr_file, is_fine=False)

base_rs_golds, base_rs_preds = get_golds_preds(base_rs_file, is_fine=False)
base_sr_golds, base_sr_preds = get_golds_preds(base_sr_file, is_fine=False)
base_rc_golds, base_rc_preds = get_golds_preds(base_rc_file, is_fine=False)
base_cr_golds, base_cr_preds = get_golds_preds(base_cr_file, is_fine=False)

base_rsq_golds, base_rsq_preds = get_golds_preds(base_rsq_file, is_fine=False)
base_rqs_golds, base_rqs_preds = get_golds_preds(base_rqs_file, is_fine=False)
base_srq_golds, base_srq_preds = get_golds_preds(base_srq_file, is_fine=False)
base_sqr_golds, base_sqr_preds = get_golds_preds(base_sqr_file, is_fine=False)
base_qrs_golds, base_qrs_preds = get_golds_preds(base_qrs_file, is_fine=False)
base_qsr_golds, base_qsr_preds = get_golds_preds(base_qsr_file, is_fine=False)

base_rcq_golds, base_rcq_preds = get_golds_preds(base_rcq_file, is_fine=False)
base_rqc_golds, base_rqc_preds = get_golds_preds(base_rqc_file, is_fine=False)
base_crq_golds, base_crq_preds = get_golds_preds(base_crq_file, is_fine=False)
base_cqr_golds, base_cqr_preds = get_golds_preds(base_cqr_file, is_fine=False)
base_qrc_golds, base_qrc_preds = get_golds_preds(base_qrc_file, is_fine=False)
base_qcr_golds, base_qcr_preds = get_golds_preds(base_qcr_file, is_fine=False)

majority_preds = np.array([[1]*len(base_r_golds[0])]*len(base_r_golds))

### Sanity check w/ Majority

In [10]:
p_value = bootstrap_significance_testing(base_r_golds, base_r_preds, majority_preds, metric, n=int(1e4))
print("Majority vs. Base: Estimated p-value: ", p_value)

diff 0.3720433038331673
macro A 0.5620672515120448
macro B 0.3760455995954612
Majority vs. Base: Estimated p-value:  0.0


### 2-inputs w/ Last Question

In [11]:
p_value = bootstrap_significance_testing(base_r_golds, base_rq_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RQuestion: Estimated p-value: ", p_value)

diff 0.027176275497404756
macro A 0.5756553892607472
macro B 0.5620672515120448
Base vs. +RQuestion: Estimated p-value:  0.1662


In [12]:
p_value = bootstrap_significance_testing(base_r_golds, base_qr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +QuestionR: Estimated p-value: ", p_value)

diff 0.008013947620708839
macro A 0.5660742253223993
macro B 0.5620672515120448
Base vs. +QuestionR: Estimated p-value:  0.3966


### 2-inputs w/ Fine-grained sentiment

In [13]:
p_value = bootstrap_significance_testing(base_r_golds, base_rs_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RSent: Estimated p-value: " + str(p_value))

diff -0.0014150345944847764
macro A 0.5613597342148025
macro B 0.5620672515120448
Base vs. +RSent: Estimated p-value: 0.5175


In [14]:
p_value = bootstrap_significance_testing(base_r_golds, base_sr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +SentR: Estimated p-value: " + str(p_value))

diff 0.008006343256639736
macro A 0.5660704231403647
macro B 0.5620672515120448
Base vs. +SentR: Estimated p-value: 0.3968


### 2-inputs w/ Coarse-grained sentiment

In [15]:
p_value = bootstrap_significance_testing(base_r_golds, base_rc_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RC: Estimated p-value: " + str(p_value))

diff 0.015872733947837503
macro A 0.5700036184859636
macro B 0.5620672515120448
Base vs. +RC: Estimated p-value: 0.3051


In [16]:
p_value = bootstrap_significance_testing(base_r_golds, base_cr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +CR: Estimated p-value: " + str(p_value))

diff 0.031203433166767214
macro A 0.5776689680954284
macro B 0.5620672515120448
Base vs. +CR: Estimated p-value: 0.1651


### 3-inputs w/ Fine-grained sentiment 

In [17]:
p_value = bootstrap_significance_testing(base_r_golds, base_rsq_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RSQ: Estimated p-value: " + str(p_value))

diff 0.029802939851991894
macro A 0.5769687214380408
macro B 0.5620672515120448
Base vs. +RSQ: Estimated p-value: 0.1961


In [18]:
p_value = bootstrap_significance_testing(base_r_golds, base_rqs_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RQS: Estimated p-value: " + str(p_value))

diff 0.01709260926989331
macro A 0.5706135561469915
macro B 0.5620672515120448
Base vs. +RQS: Estimated p-value: 0.3232


In [19]:
p_value = bootstrap_significance_testing(base_r_golds, base_srq_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +SRQ: Estimated p-value: " + str(p_value))

diff 0.01812868971198478
macro A 0.5711315963680372
macro B 0.5620672515120448
Base vs. +SRQ: Estimated p-value: 0.3017


In [20]:
p_value = bootstrap_significance_testing(base_r_golds, base_sqr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +SQR: Estimated p-value: " + str(p_value))

diff 0.02407802536103043
macro A 0.5741062641925601
macro B 0.5620672515120448
Base vs. +SQR: Estimated p-value: 0.2229


In [21]:
p_value = bootstrap_significance_testing(base_r_golds, base_qrs_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +QRS: Estimated p-value: " + str(p_value))

diff 0.05426686228918709
macro A 0.5892006826566384
macro B 0.5620672515120448
Base vs. +QRS: Estimated p-value: 0.0467


In [22]:
p_value = bootstrap_significance_testing(base_r_golds, base_qsr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +QSR: Estimated p-value: " + str(p_value))

diff 0.08308571333692494
macro A 0.6036101081805073
macro B 0.5620672515120448
Base vs. +QSR: Estimated p-value: 0.0093


### 3 inputs w/ Coarse-grained sentiment

In [24]:
p_value = bootstrap_significance_testing(base_r_golds, base_rcq_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RCQ: Estimated p-value: " + str(p_value))

diff 0.06128379538801787
macro A 0.5927091492060538
macro B 0.5620672515120448
Base vs. +RCQ: Estimated p-value: 0.044


In [25]:
p_value = bootstrap_significance_testing(base_r_golds, base_rqc_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +RQC: Estimated p-value: " + str(p_value))

diff 0.0022255134797510845
macro A 0.5631800082519204
macro B 0.5620672515120448
Base vs. +RQC: Estimated p-value: 0.478


In [26]:
p_value = bootstrap_significance_testing(base_r_golds, base_crq_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +CRQ: Estimated p-value: " + str(p_value))

diff 0.041623542120078794
macro A 0.5828790225720842
macro B 0.5620672515120448
Base vs. +CRQ: Estimated p-value: 0.107


In [27]:
p_value = bootstrap_significance_testing(base_r_golds, base_cqr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +CQR: Estimated p-value: " + str(p_value))

diff 0.02952227308508326
macro A 0.5768283880545865
macro B 0.5620672515120448
Base vs. +CQR: Estimated p-value: 0.1728


In [28]:
p_value = bootstrap_significance_testing(base_r_golds, base_qrc_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +QRC: Estimated p-value: " + str(p_value))

diff -0.0014526508637502378
macro A 0.5613409260801697
macro B 0.5620672515120448
Base vs. +QRC: Estimated p-value: 0.5121


In [29]:
p_value = bootstrap_significance_testing(base_r_golds, base_qcr_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. +QCR: Estimated p-value: " + str(p_value))

diff -0.008625056380368923
macro A 0.5577547233218604
macro B 0.5620672515120448
Base vs. +QCR: Estimated p-value: 0.6063


## Hierarchical

In [30]:
hier_r_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/1_input/metrics_roberta_hierarchical_r_text_test.json_fold"

hier_rq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_r_text_q_text_last_question_test.json_fold"
hier_qr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_q_text_last_question_r_text_test.json_fold"

hier_rs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_r_text_gold_sentiments_num_test.json_fold"
hier_sr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_gold_sentiments_num_r_text_test.json_fold"
hier_rc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_test.json_fold"
hier_cr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_r_text_test.json_fold"

hier_nr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_q_speaker_r_text_test.json_fold"
hier_lr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/2_input/metrics_roberta_hierarchical_q_speaker_role_r_text_test.json_fold"


hier_rsq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_r_text_gold_sentiments_num_q_text_last_question_test.json_fold"
hier_rqs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_r_text_q_text_last_question_gold_sentiments_num_test.json_fold"
hier_srq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_gold_sentiments_num_r_text_q_text_last_question_test.json_fold"
hier_sqr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_gold_sentiments_num_q_text_last_question_r_text_test.json_fold"
hier_qrs_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_q_text_last_question_r_text_gold_sentiments_num_test.json_fold"
hier_qsr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_q_text_last_question_gold_sentiments_num_r_text_test.json_fold"

hier_rcq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_r_text_gold_sentiments_coarse_num_q_text_last_question_test.json_fold"
hier_rqc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_r_text_q_text_last_question_gold_sentiments_coarse_num_test.json_fold"
hier_crq_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_r_text_q_text_last_question_test.json_fold"
hier_cqr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_gold_sentiments_coarse_num_q_text_last_question_r_text_test.json_fold"
hier_qrc_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_q_text_last_question_r_text_gold_sentiments_coarse_num_test.json_fold"
hier_qcr_file = "/Users/elisa/Documents/CompLing/congressional_hearing/results/roberta_classification/roberta_test_hier/3_input/metrics_roberta_hierarchical_q_text_last_question_gold_sentiments_coarse_num_r_text_test.json_fold"


hier_r_golds, hier_r_preds = get_golds_preds(hier_r_file, is_fine=True)

hier_rq_golds, hier_rq_preds = get_golds_preds(hier_rq_file, is_fine=True)
hier_qr_golds, hier_qr_preds = get_golds_preds(hier_qr_file, is_fine=True)

hier_rs_golds, hier_rs_preds = get_golds_preds(hier_rs_file, is_fine=True)
hier_sr_golds, hier_sr_preds = get_golds_preds(hier_sr_file, is_fine=True)
hier_rc_golds, hier_rc_preds = get_golds_preds(hier_rc_file, is_fine=True)
hier_cr_golds, hier_cr_preds = get_golds_preds(hier_cr_file, is_fine=True)

hier_rsq_golds, hier_rsq_preds = get_golds_preds(hier_rsq_file, is_fine=True)
hier_rqs_golds, hier_rqs_preds = get_golds_preds(hier_rqs_file, is_fine=True)
hier_srq_golds, hier_srq_preds = get_golds_preds(hier_srq_file, is_fine=True)
hier_sqr_golds, hier_sqr_preds = get_golds_preds(hier_sqr_file, is_fine=True)
hier_qrs_golds, hier_qrs_preds = get_golds_preds(hier_qrs_file, is_fine=True)
hier_qsr_golds, hier_qsr_preds = get_golds_preds(hier_qsr_file, is_fine=True)

hier_rcq_golds, hier_rcq_preds = get_golds_preds(hier_rcq_file, is_fine=True)
hier_rqc_golds, hier_rqc_preds = get_golds_preds(hier_rqc_file, is_fine=True)
hier_crq_golds, hier_crq_preds = get_golds_preds(hier_crq_file, is_fine=True)
hier_cqr_golds, hier_cqr_preds = get_golds_preds(hier_cqr_file, is_fine=True)
hier_qrc_golds, hier_qrc_preds = get_golds_preds(hier_qrc_file, is_fine=True)
hier_qcr_golds, hier_qcr_preds = get_golds_preds(hier_qcr_file, is_fine=True)

In [32]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_r_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. Hier: Estimated p-value: " + str(p_value))

diff 0.015253227607384456
macro A 0.5696938653157371
macro B 0.5620672515120448
Base vs. Hier: Estimated p-value: 0.3091


### 2-inputs w/ Last Question

In [50]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RQuestion: Estimated p-value: ", p_value)

diff -0.03959201116125333
macro A 0.5498978597351104
macro B 0.5696938653157371
Hier vs. +RQuestion: Estimated p-value:  0.8814


In [51]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QuestionR: Estimated p-value: ", p_value)

diff 0.03726209233119726
macro A 0.5883249114813357
macro B 0.5696938653157371
Hier vs. +QuestionR: Estimated p-value:  0.1204


### 2-inputs w/ Questioner

In [50]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_nr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QuestionerR: Estimated p-value: ", p_value)

diff -0.03959201116125333
macro A 0.5498978597351104
macro B 0.5696938653157371
Hier vs. +RQuestion: Estimated p-value:  0.8814


### 2-inputs w/ Fine-grained sentiment

In [33]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rs_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RSent: Estimated p-value: " + str(p_value))

diff 0.023072767599450872
macro A 0.5812302491154625
macro B 0.5696938653157371
Hier vs. +RSent: Estimated p-value: 0.2143


In [34]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_sr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +SentR: Estimated p-value: " + str(p_value))

diff 0.00959648331317875
macro A 0.5744921069723264
macro B 0.5696938653157371
Hier vs. +SentR: Estimated p-value: 0.3721


### 2-inputs w/ Coarse-grained sentiment

In [36]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RC: Estimated p-value: " + str(p_value))

diff 0.03376292685933868
macro A 0.5865753287454064
macro B 0.5696938653157371
Hier vs. +RC: Estimated p-value: 0.1286


In [61]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_cr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CR: Estimated p-value: " + str(p_value))

diff 0.046093668920515585
macro A 0.5927406997759949
macro B 0.5696938653157371
Hier vs. +CR: Estimated p-value: 0.058264


### 2-inputs w/ Last Question compared to base

In [53]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rq_preds, base_r_preds, metric, n=int(1e4))
print("Base vs. Hier +RQuestion: Estimated p-value: ", p_value)

diff -0.02433878355386887
macro A 0.5498978597351104
macro B 0.5620672515120448
Base vs. Hier +RQuestion: Estimated p-value:  0.7765


In [54]:
p_value = bootstrap_significance_testing(base_r_golds, hier_qr_preds, hier_r_preds, metric, n=int(1e4))
print("Base vs. Hier +QuestionR: Estimated p-value: ", p_value)

diff 0.03726209233119726
macro A 0.5883249114813357
macro B 0.5696938653157371
Base vs. Hier +QuestionR: Estimated p-value:  0.1157


### 2-inputs w/ Fine-grained sentiment compared to base

In [55]:
p_value = bootstrap_significance_testing(base_r_golds, hier_rs_preds, hier_r_preds, metric, n=int(1e4))
print("Base vs. Hier +RSent: Estimated p-value: " + str(p_value))

diff 0.023072767599450872
macro A 0.5812302491154625
macro B 0.5696938653157371
Base vs. Hier +RSent: Estimated p-value: 0.2091


In [57]:
p_value = bootstrap_significance_testing(base_r_golds, hier_sr_preds, hier_r_preds, metric, n=int(1e4))
print("Base vs. Hier +SentR: Estimated p-value: " + str(p_value))

diff 0.00959648331317875
macro A 0.5744921069723264
macro B 0.5696938653157371
Base vs. Hier +SentR: Estimated p-value: 0.3879


### 2-inputs w/ Coarse-grained sentiment compared to base

In [58]:
p_value = bootstrap_significance_testing(base_r_golds, hier_rc_preds, hier_r_preds, metric, n=int(1e4))
print("Base vs. Hier +RC: Estimated p-value: " + str(p_value))

diff 0.03376292685933868
macro A 0.5865753287454064
macro B 0.5696938653157371
Base vs. Hier +RC: Estimated p-value: 0.127


In [59]:
p_value = bootstrap_significance_testing(base_r_golds, hier_cr_preds, hier_r_preds, metric, n=int(1e4))
print("Base vs. Hier +CR: Estimated p-value: " + str(p_value))

diff 0.046093668920515585
macro A 0.5927406997759949
macro B 0.5696938653157371
Base vs. Hier +CR: Estimated p-value: 0.0593


### 3-inputs w/ Fine-grained sentiment

In [38]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rsq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RSQ: Estimated p-value: " + str(p_value))

diff 0.08009207685397124
macro A 0.6097399037427227
macro B 0.5696938653157371
Hier vs. +RSQ: Estimated p-value: 0.008


In [39]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rqs_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RQS: Estimated p-value: " + str(p_value))

diff 0.02067881697938967
macro A 0.5800332738054319
macro B 0.5696938653157371
Hier vs. +RQS: Estimated p-value: 0.2636


In [40]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_srq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +SRQ: Estimated p-value: " + str(p_value))

diff 0.042376207124506404
macro A 0.5908819688779903
macro B 0.5696938653157371
Hier vs. +SRQ: Estimated p-value: 0.0802


In [41]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_sqr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +SQR: Estimated p-value: " + str(p_value))

diff 0.03128913845335157
macro A 0.5853384345424129
macro B 0.5696938653157371
Hier vs. +SQR: Estimated p-value: 0.1644


In [42]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qrs_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QRS: Estimated p-value: " + str(p_value))

diff 0.029375152728924547
macro A 0.5843814416801993
macro B 0.5696938653157371
Hier vs. +QRS: Estimated p-value: 0.1804


In [43]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qsr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QSR: Estimated p-value: " + str(p_value))

diff 0.07143120724728891
macro A 0.6054094689393815
macro B 0.5696938653157371
Hier vs. +QSR: Estimated p-value: 0.0074


### 3 inputs w/ Coarse-grained sentiment

In [44]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rcq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RCQ: Estimated p-value: " + str(p_value))

diff 0.06825419368626351
macro A 0.6038209621588688
macro B 0.5696938653157371
Hier vs. +RCQ: Estimated p-value: 0.0145


In [45]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_rqc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +RQC: Estimated p-value: " + str(p_value))

diff 0.01568101391975607
macro A 0.5775343722756151
macro B 0.5696938653157371
Hier vs. +RQC: Estimated p-value: 0.3166


In [46]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_crq_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CRQ: Estimated p-value: " + str(p_value))

diff 0.038920157960475876
macro A 0.589153944295975
macro B 0.5696938653157371
Hier vs. +CRQ: Estimated p-value: 0.0916


In [47]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_cqr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +CQR: Estimated p-value: " + str(p_value))

diff 0.009179190098600243
macro A 0.5742834603650372
macro B 0.5696938653157371
Hier vs. +CQR: Estimated p-value: 0.3862


In [48]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qrc_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QRC: Estimated p-value: " + str(p_value))

diff 0.02620662886173175
macro A 0.582797179746603
macro B 0.5696938653157371
Hier vs. +QRC: Estimated p-value: 0.1914


In [49]:
p_value = bootstrap_significance_testing(hier_r_golds, hier_qcr_preds, hier_r_preds, metric, n=int(1e4))
print("Hier vs. +QCR: Estimated p-value: " + str(p_value))

diff -0.0008984884744893673
macro A 0.5692446210784924
macro B 0.5696938653157371
Hier vs. +QCR: Estimated p-value: 0.5045
