# Trec Eval and Significance Testing of all 10 Methods of Task 1 & 2

In [1]:
import logging
import sys
import scipy
from scipy import stats
import subprocess
import re
import numpy as np

Write run, get output and analyse output functions:

In [2]:
def write_run(model_name, data, out_f,
              max_objects_per_query=sys.maxsize,
              skip_sorting=False):
    """
    Write a run to an output file.
    Parameters:
        - model_name: identifier of run.
        - data: dictionary mapping topic_id to object_assesments;
            object_assesments is an iterable (list or tuple) of
            (relevance, object_id) pairs.
            The object_assesments iterable is sorted by decreasing order.
        - out_f: output file stream.
        - max_objects_per_query: cut-off for number of objects per query.
    """
    for subject_id, object_assesments in data.items():
        if not object_assesments:
            logging.warning('Received empty ranking for %s; ignoring.',
                            subject_id)

            continue

        # Probe types, to make sure everything goes alright.
        # assert isinstance(object_assesments[0][0], float) or \
        #     isinstance(object_assesments[0][0], np.float32)
        assert isinstance(object_assesments[0][1], str) or \
            isinstance(object_assesments[0][1], bytes)

        if not skip_sorting:
            object_assesments = sorted(object_assesments, reverse=True)

        if max_objects_per_query < sys.maxsize:
            object_assesments = object_assesments[:max_objects_per_query]

        if isinstance(subject_id, bytes):
            subject_id = subject_id.decode('utf8')

        for rank, (relevance, object_id) in enumerate(object_assesments):
            if isinstance(object_id, bytes):
                object_id = object_id.decode('utf8')

            out_f.write(
                '{subject} Q0 {object} {rank} {relevance} '
                '{model_name}\n'.format(
                    subject=subject_id,
                    object=object_id,
                    rank=rank + 1,
                    relevance=relevance,
                    model_name=model_name))
            
# The following writes the run to standard output.
# In your code, you should write the runs to local
# storage in order to pass them to trec_eval.
# write_run(
#     model_name="PLM",
#     data=PLM_scores,
#     out_f=open("results/PLM_scores.run", "w"),
#     max_objects_per_query=1000)

In [3]:
r = re.compile(r'([^ \\t]*)\\t*')

def create_output(type_set, filename):
    if type_set == 'test':
        command = "trec_eval -m all_trec -q ap_88_89/qrel_test "
    else:
        command = "trec_eval -m all_trec -q ap_88_89/qrel_validation "
    command +=  "runfiles/" + filename #+" | grep -E '\sall\s'"
    
    output = str(subprocess.check_output(command, shell = True))
    return output

In [4]:
def analyse_output(output, title):
    # NDCG@10, Mean Average Precision (MAP@1000), Precision@5 and Recall@1000.
    measure_results = {}
    measures = ["ndcg_cut_10", ["100"]], ["map_cut_1000",[]], ["P_5", ["500", "relative"]], ["recall_1000",[]]
    for measure in measures:
        measure_list = []
        measure_all = 0
        for line in output.split():
            if measure[0] in line:
                clean = True
                for restriction in measure[1]:
                    if restriction in line:
                        clean = False
                if clean:
                    if "tall" in line:
                        measure_all = r.findall(line)[-1]
                    else:
                        measure_list.append(float(r.findall(line)[-1]))
        measure_results[measure[0]] = measure_all, measure_list

    return [title, measure_results]

# Getting all performance metrics of the models

In [5]:
metrics = ["ndcg_cut_10", "map_cut_1000", "P_5","recall_1000"]

In [6]:
# TF-IDF
outtf = create_output('test', 'TFIDFrun.run')
restf = analyse_output(outtf, "TFidf")
print(restf[0])
tfidf = [[],[],[],[]]
for key,value in restf[1].items():
    ind = metrics.index(key)
    tfidf[ind] = value[1]

#BM-25
outbm = create_output('test', 'BM25run.run')
resbm = analyse_output(outbm, "BM25")
print(resbm[0])
bm25 = [[],[],[],[]]
for key,value in resbm[1].items():
    ind = metrics.index(key)
    bm25[ind] = value[1]
    
#Jelinek-Mercer
outl02t = create_output('test', 'jelinek_scores_test.run')
resl02t = analyse_output(outl02t, "Jelinek lamb = 0.2")
print(resl02t[0])
jelinek = [[],[],[],[]]
for key,value in resl02t[1].items():
    ind = metrics.index(key)
    jelinek[ind] = value[1]

output2000t = create_output('test', 'dirichlet_scores_test.run')
measure_results2000t = analyse_output(output2000t, "Dirichlet mu = 2000")
print(measure_results2000t[0])
dirichlet = [[],[],[],[]]
for key,value in measure_results2000t[1].items():
    ind = metrics.index(key)
    dirichlet[ind] = value[1]
    
#Absolute Discounting
outd08t = create_output('test', 'AD_scores_test.run')
resd08t = analyse_output(outd08t, "AD delta = 0.8")
print(resd08t[0])
ad = [[],[],[],[]]
for key,value in resd08t[1].items():
    ind = metrics.index(key)
    ad[ind] = value[1]
    
#Positional Language Model
outplm = create_output('test', 'PLM_scores.run')
resplm = analyse_output(outplm, "PLM")
print(resplm[0])
plm = [[],[],[],[]]
for key,value in resplm[1].items():
    ind = metrics.index(key)
    plm[ind] = value[1]

#Word2Vec
outw2v = create_output('test', 'test.run')
resw2v = analyse_output(outw2v, "Word2Vec")
print(resw2v[0])
w2v = [[],[],[],[]]
for key,value in resw2v[1].items():
    ind = metrics.index(key)
    w2v[ind] = value[1]

#LSI
outlsi = create_output('test', 'tfidf_lsi_2.run')
reslsi = analyse_output(outlsi, "LSI")
print(reslsi[0])
lsi = [[],[],[],[]]
for key,value in reslsi[1].items():
    ind = metrics.index(key)
    lsi[ind] = value[1]    

#LDA    
outlda = create_output('test', 'tfidf_lda.run')
reslda = analyse_output(outlda, "LDA")
print(reslda[0])
lda = [[],[],[],[]]
for key,value in reslda[1].items():
    ind = metrics.index(key)
    lda[ind] = value[1] 
    
#Doc2Vec
outd2v = create_output('test', 'tfidf_doc2vec.run')
resd2v = analyse_output(outd2v, "Doc2Vec")
print(resd2v[0])
d2v = [[],[],[],[]]
for key,value in resd2v[1].items():
    ind = metrics.index(key)
    d2v[ind] = value[1]

TFidf
BM25
Jelinek lamb = 0.2
Dirichlet mu = 2000
AD delta = 0.8
PLM
Word2Vec
LSI
LDA
Doc2Vec


## Get P-values of 2tailed T-test and check difference in mean for alpha = 0.05

In [7]:
results = [tfidf, bm25, jelinek, dirichlet, plm, ad, w2v, lsi, lda, d2v]
methods = ['tf-idf','bm25','jelinek','dirichlet','plm','ad','w2v','lsi','lda','d2v']
alpha = 0.05

In [8]:
for method1 in range(len(methods)):
    for method2 in range(len(methods)):
        if method1 != method2:
            pvalues = []
            for metric in range(len(metrics)):
                
                list_a = results[method1][metric]
                list_b = results[method2][metric]
                pvalue = scipy.stats.ttest_ind(list_a, list_b)[-1]
                pvalues.append(pvalue)
            
            product = 1
            for p in pvalues:
                product *= (1-p)
            new_p = 1-product
            if new_p < alpha:
                print('significant:', round(new_p,6), '\t',methods[method1],'and', methods[method2])
            else:
                print('NOT sign:', round(new_p,3), '\t\t', methods[method1], 'and', methods[method2])
            

NOT sign: 1.0 		 tf-idf and bm25
NOT sign: 0.987 		 tf-idf and jelinek
NOT sign: 0.997 		 tf-idf and dirichlet
NOT sign: 0.907 		 tf-idf and plm
NOT sign: 0.999 		 tf-idf and ad
significant: 0.047854 	 tf-idf and w2v
NOT sign: 0.074 		 tf-idf and lsi
significant: 0.045423 	 tf-idf and lda
significant: 0.042768 	 tf-idf and d2v
NOT sign: 1.0 		 bm25 and tf-idf
NOT sign: 0.98 		 bm25 and jelinek
NOT sign: 0.998 		 bm25 and dirichlet
NOT sign: 0.898 		 bm25 and plm
NOT sign: 0.993 		 bm25 and ad
significant: 0.035327 	 bm25 and w2v
NOT sign: 0.067 		 bm25 and lsi
significant: 0.032871 	 bm25 and lda
significant: 0.030308 	 bm25 and d2v
NOT sign: 0.987 		 jelinek and tf-idf
NOT sign: 0.98 		 jelinek and bm25
NOT sign: 0.964 		 jelinek and dirichlet
NOT sign: 0.999 		 jelinek and plm
NOT sign: 0.999 		 jelinek and ad
NOT sign: 0.05 		 jelinek and w2v
NOT sign: 0.11 		 jelinek and lsi
significant: 0.047417 	 jelinek and lda
significant: 0.04453 	 jelinek and d2v
NOT sign: 0.997 		 dirichlet 