In [1]:
# Randomized statistical testing

In [2]:
import numpy as np
import os
import subprocess
import sys
import random
import timeit
from scipy import stats
import itertools

In [3]:
# run_dev_algo_A = './bioasq_dir/run_bioasq_linearModel_test_filtered'
# run_dev_algo_B = './bioasq_dir/run_bm25_bioasq_test_filtered'

In [4]:
# qrels_file = './robust_dir/s1/robust_test_s1_qrels'
# trec_eval_command = '../../eval/trec_eval'

# run_dev_algo_A = './robust_dir/s1/run_robust_s1_best_lmart_test'

# run_dev_algo_B = './robust_dir/s1/run_bm25_robust_test_s1'
# # run_dev_algo_B = run_dev_algo_A

In [5]:
def comp(x):
    """Return ``1 - |x|``; for a binary flag this flips 0 to 1 and 1 to 0."""
    magnitude = abs(x)
    return 1 - magnitude

In [6]:
def meanAP(list_X):
    """Return the arithmetic mean (``np.mean``) of the values yielded by ``list_X``."""
    values = list(list_X)
    return np.mean(values)

In [7]:
def map_A_B(paired_list):
    """Average the two score columns of ``paired_list``.

    Each row is indexed at positions 1 and 2 (position 0 is not read).

    Returns:
        list: ``[mean of column 1, mean of column 2]`` as computed by ``meanAP``.
    """
    column_means = []
    for column in (1, 2):
        column_means.append(meanAP([row[column] for row in paired_list]))
    return column_means

In [8]:
def list_to_str(int_list):
    """Concatenate ``str()`` of every element of ``int_list`` with no separator."""
    return "".join(map(str, int_list))

In [9]:
def get_run_avgs(trec_eval_command, metric, qrel, qret):
    """Run trec_eval for one measure and return its per-query scores.

    Executes ``<trec_eval_command> -q -m <metric> <qrel> <qret>`` and parses
    every non-blank line of its standard output as whitespace-separated
    ``<measure> <query_id> <value>`` fields.

    Args:
        trec_eval_command: Path of the executable to launch.
        metric: Measure name passed after ``-m``.
        qrel: Path passed as the first positional argument.
        qret: Path passed as the second positional argument.

    Returns:
        dict: query id (str) -> score (float). The summary row whose query id
        is ``all`` is skipped.

    Raises:
        RuntimeError: If the command exits with a non-zero status; the message
            carries the captured stderr/stdout.
        ValueError: If an output line has fewer than three fields or a
            non-numeric value.
    """
    command = [trec_eval_command, '-q', '-m', metric, qrel, qret]

    # stderr is captured separately so diagnostics are never parsed as scores.
    proc = subprocess.run(command,
                          stdin=subprocess.DEVNULL,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          shell=False)
    stdout_text = proc.stdout.decode("utf-8")

    if proc.returncode != 0:
        stderr_text = proc.stderr.decode("utf-8", errors="replace")
        raise RuntimeError(
            'Command {} failed with exit status {}: {}'.format(
                command, proc.returncode, (stderr_text or stdout_text).strip()))

    out_dict = {}
    for line in stdout_text.splitlines():
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError('Unexpected evaluation output line: {!r}'.format(line))
        query_id = fields[1]
        # The aggregate row is identified by its id rather than by its position.
        if query_id == 'all':
            continue
        out_dict[query_id] = float(fields[2])
    return out_dict

In [10]:
def get_paired_list(dict_A, dict_B):
    """Pair up the values of two dicts that share exactly the same keys.

    Returns:
        list | None: ``[key, dict_A[key], dict_B[key]]`` rows in ``dict_A``'s
        iteration order, or ``None`` (after printing a message) when the two
        key sets differ.
    """
    if set(dict_A.keys()) != set(dict_B.keys()):
        print('Queries sets are different!')
        return None
    return [[key, value_a, dict_B[key]] for key, value_a in dict_A.items()]

In [11]:
def permute_paired_list(paired_list, b_filter):
    """Swap the two score columns of selected rows.

    Rows are walked in lockstep with ``b_filter``: a flag of 0 keeps
    ``[row[0], row[1], row[2]]`` and a flag of 1 yields
    ``[row[0], row[2], row[1]]``.
    """
    permuted = []
    for row, flag in zip(paired_list, b_filter):
        permuted.append([row[0], row[flag + 1], row[comp(flag) + 1]])
    return permuted

In [12]:
def get_p_value(permuted_maps, observed_value):
    """Two-sided randomization p-value of an observed score difference.

    Args:
        permuted_maps: Sequence of ``[score_A, score_B]`` pairs, one per
            permutation; the difference ``score_A - score_B`` is used.
        observed_value: The difference measured on the unpermuted data.

    Returns:
        list: ``[observed_value, p_value]`` where ``p_value`` is the fraction
        of permutations whose absolute difference is at least as large as
        ``abs(observed_value)``.

    Raises:
        ZeroDivisionError: If ``permuted_maps`` is empty.
    """
    threshold = abs(observed_value)
    # ">=" (not ">"): permutations exactly as extreme as the observation count
    # as evidence for the null. With a strict comparison two identical systems
    # (every difference == 0) would get p == 0 and be reported as significant.
    extreme_count = sum(1 for pair in permuted_maps
                        if abs(pair[0] - pair[1]) >= threshold)
    p_value = extreme_count / len(permuted_maps)
    return [observed_value, p_value]

In [13]:
def compute_pvalue(paired_list, alpha=0.05, max_iter=20000, min_i=1000, seed=None):
    '''Compute randomized two-tailed significance testing.

    Repeatedly draws a random 0/1 filter (one flag per row, never the same
    filter twice), swaps the two scores of the flagged rows and records the
    mean scores of the permuted data. The p-value is obtained from
    ``get_p_value`` on the recorded permutations. A one-sample t-test on the
    per-row differences is computed alongside for comparison.

    Args:
        paired_list: Rows of ``[query, score_A, score_B]``.
        alpha: Significance level used for the ``'Significant'`` flag.
        max_iter: Upper bound on the number of permutations drawn.
        min_i: Early stopping is only considered for loop indices greater
            than this value.
        seed: Optional seed for a private ``np.random.RandomState`` used to
            draw the filters. With the default ``None`` the global
            ``np.random`` state is used.

    Returns:
        dict with keys ``'Significant'`` (the string ``'True'``/``'False'``),
        ``'rand_pvalue'``, ``'t_pvalue'``, ``'Metric diff'`` and
        ``'t_statistic'``.

    Raises:
        ValueError: If ``paired_list`` is ``None``/empty or ``max_iter < 1``.
    '''
    if paired_list is None or len(paired_list) == 0:
        raise ValueError('paired_list must contain at least one [query, score_A, score_B] row')
    if max_iter < 1:
        raise ValueError('max_iter must be at least 1')

    rng = np.random if seed is None else np.random.RandomState(seed)
    n_pairs = len(paired_list)

    # The observed difference does not depend on the permutations: compute once.
    maps_two_algorithms = map_A_B(paired_list)
    map_diff_test = maps_two_algorithms[0] - maps_two_algorithms[1]

    # Only 2**n_pairs distinct filters exist. Without this cap the redraw loop
    # below could never find an unseen filter once all of them were used.
    n_draws = min(max_iter, 2 ** n_pairs)

    seen_filters = set()
    permuted_maps = []
    for i in range(n_draws):
        b_filter = list(rng.randint(2, size=(n_pairs,)))
        filter_key = list_to_str(b_filter)
        while filter_key in seen_filters:
            # Already used: draw again so every permutation is distinct.
            b_filter = list(rng.randint(2, size=(n_pairs,)))
            filter_key = list_to_str(b_filter)
        seen_filters.add(filter_key)

        permuted_maps.append(map_A_B(permute_paired_list(paired_list, b_filter)))

        # The running p-value is only needed once early stopping is allowed.
        if i > min_i:
            pvalue = get_p_value(permuted_maps, map_diff_test)[1]
            # Stop as soon as the estimate is clearly below or above the
            # 0.01 - 0.1 band.
            if (pvalue < 0.01) or (pvalue > 0.1):
                break

    [obs_value, pvalue] = get_p_value(permuted_maps, map_diff_test)

    # Compare against Student t-test 1sample
    # NOTE(review): this reseeds the *global* NumPy RNG after the draws above,
    # so it does not make this call reproducible (pass ``seed`` for that). It
    # is kept because it fixes the RNG state seen by whatever runs next, which
    # existing callers may rely on.
    np.random.seed(12345678)

    rs_diff = [x[1] - x[2] for x in paired_list]
    [t_statistic, t_pvalue] = stats.ttest_1samp(rs_diff, 0)

    sign_flag = pvalue < alpha

    return {'Significant': str(sign_flag),
            'rand_pvalue': pvalue,
            't_pvalue': t_pvalue,
            'Metric diff': obs_value,
            't_statistic': t_statistic
        }

In [14]:
# For each dataset and hpo method
#
# Every entry below is a list whose first element is the qrels file and whose
# remaining elements are the run files to be compared pairwise.

bio_test_runs = [
    './deep-relevance-ranking/models/baselines/bioasq_dir/bioasq_test_qrels',  # qrel
    './deep-relevance-ranking/models/baselines/bioasq_dir/run_bm25_bioasq_test_filtered',  # bm25
    './deep-relevance-ranking/models/baselines/bioasq_dir/run_bioasq_linearModel_test_filtered',  # bm25+extra
    './deep-relevance-ranking/models/baselines/posit_results4/qret.txt',  # Deep model
    './deep-relevance-ranking/models/baselines/bioasq_dir/run_bioasq_best_lmart_test_leaves15_lr0.07_n750',  # lambdaMart
]

folds = ['s1', 's2', 's3', 's4', 's5']

# One list per fold, built from path templates in which {f} is the fold name.
# (No deep-model run is listed for this dataset.)
robust_test_runs = [
    [
        template.format(f=fold)
        for template in (
            './deep-relevance-ranking/models/baselines/robust_dir/{f}/robust_test_{f}_qrels',  # qrel
            './deep-relevance-ranking/models/baselines/robust_dir/{f}/run_bm25_robust_test_{f}',  # bm25
            './deep-relevance-ranking/models/baselines/robust_dir/{f}/run_robust_linearModel_test_{f}',  # bm25+extra
            './deep-relevance-ranking/models/baselines/robust_dir/{f}/run_robust_{f}_best_lmart_test_leaves25_lr0.03_n450',  # lambdaMart
        )
    ]
    for fold in folds
]

tvqa_test_runs = [
    './TVQA/workdir/gold_answer_qrels_test',  # qrel
    './TVQA/workdir/retrieved_files/run_tfidf_test',  # baseline
    './TVQA/deep_results/run_deep_test',  # Deep model, check the results when it finishes the training
    './TVQA/workdir/retrieved_files/run_best_lmart_test_leaves5_lr0.44_n1350',  # lambdaMart
]

# BioASQ first, then the five robust folds in order, then TVQA.
file_dirs = [bio_test_runs] + robust_test_runs + [tvqa_test_runs]
# file_dirs = [bio_test_runs,  tvqa_test_runs]

In [15]:
# Path of the evaluation executable that get_run_avgs launches as a subprocess
# (a relative path, so it is resolved against the current working directory).
trec_eval_command = './trec_eval/trec_eval'

In [16]:
# Settings forwarded to compute_pvalue.
max_iter = 20000
min_i = 1000
alpha = 0.05

# initial_b_filter = [1] * 400

start_time = timeit.default_timer()
dict_A = {}
dict_B = {}

# Per-query scores of the robust folds are merged across s1..s5 and tested
# once, when the s5 fold is reached. Each model pair needs its OWN pair of
# accumulators: with a single shared pair, the scores stored for earlier folds
# belong to whichever model pair was processed last, so the merged result
# mixes different models. Pairs are keyed by their position in the
# combinations list, which relies on every fold listing its runs in the same
# order.
robust_scores = {}

for fdir in file_dirs:

    qrels_file = fdir[0]
    metric = 'success.1' if 'TVQA' in qrels_file else 'map'
    comb_folder = list(itertools.combinations(fdir[1:], 2))

    is_robust = 'robust' in qrels_file
    is_first_fold = is_robust and not any(x in qrels_file for x in ['s2', 's3', 's4', 's5'])
    is_last_fold = is_robust and ('s5' in qrels_file)
    if is_first_fold:
        # A new sequence of robust folds starts: drop any earlier accumulation.
        robust_scores = {}

    for pair_idx, comb in enumerate(comb_folder):
        model_A = comb[0]
        model_B = comb[1]

        dict_part_A = get_run_avgs(trec_eval_command, metric, qrels_file, model_A)
        dict_part_B = get_run_avgs(trec_eval_command, metric, qrels_file, model_B)

        if is_robust:
            dict_A, dict_B = robust_scores.setdefault(pair_idx, ({}, {}))
            dict_A.update(dict_part_A)
            dict_B.update(dict_part_B)
            if not is_last_fold:
                # Keep accumulating; the test runs on the last fold only.
                continue
        else:
            # Fresh dicts per pair so that no score of a previously compared
            # model can linger for a query the current run does not cover.
            dict_A = dict_part_A
            dict_B = dict_part_B

        paired_list = get_paired_list(dict_A, dict_B)
        if paired_list is None:
            # get_paired_list has already reported the mismatch; name the pair
            # and move on instead of failing inside compute_pvalue.
            print('Skipping ' + model_A + ' vs ' + model_B + ': the two runs do not cover the same queries')
            continue

        metric_A = meanAP(list(dict_A.values()))
        metric_B = meanAP(list(dict_B.values()))

        results = compute_pvalue(paired_list, alpha, max_iter, min_i=min_i)

        name_A = model_A.split('/')[-1]
        name_B = model_B.split('/')[-1]
        if is_robust:
            # Only the merged robust result is relabelled; applying the
            # replacement everywhere mangles names such as '..._leaves5_...'.
            name_A = name_A.replace('s5', 'summed')
            name_B = name_B.replace('s5', 'summed')

        print('Model A: ' + name_A, '| ', metric.upper(), ': ', round(metric_A, 4))
        print('Model B: ' + name_B, '| ', metric.upper(), ': ', round(metric_B, 4))
        print(results)
#         print('Time spent: ', timeit.default_timer() - start_time, '\n')

Model A: run_bm25_bioasq_test_filtered |  MAP :  0.4598
Model B: run_bioasq_linearModel_test_filtered |  MAP :  0.4641
{'Significant': 'False', 'rand_pvalue': 0.5449101796407185, 't_pvalue': 0.5168462929992957, 'Metric diff': -0.004243249999999976, 't_statistic': -0.6487916502629775}
Model A: run_bm25_bioasq_test_filtered |  MAP :  0.4598
Model B: qret.txt |  MAP :  0.4763
{'Significant': 'True', 'rand_pvalue': 0.001996007984031936, 't_pvalue': 0.0033938463770196046, 'Metric diff': -0.016519249999999985, 't_statistic': -2.9473005326272705}
Model A: run_bm25_bioasq_test_filtered |  MAP :  0.4598
Model B: run_bioasq_best_lmart_test_leaves15_lr0.07_n750 |  MAP :  0.4692
{'Significant': 'False', 'rand_pvalue': 0.1536926147704591, 't_pvalue': 0.14501867696864665, 'Metric diff': -0.009346999999999939, 't_statistic': -1.460212231257566}
Model A: run_bioasq_linearModel_test_filtered |  MAP :  0.4641
Model B: qret.txt |  MAP :  0.4763
{'Significant': 'False', 'rand_pvalue': 0.10778443113772455,