In [108]:
import logging
import sys

def write_run(model_name, data, out_f,
              max_objects_per_query=sys.maxsize,
              skip_sorting=False):
    """
    Write a run to an output file.
    Parameters:
        - model_name: identifier of run.
        - data: dictionary mapping topic_id to object_assesments;
            object_assesments is an iterable (list or tuple) of
            (relevance, object_id) pairs.
            The object_assesments iterable is sorted by decreasing order.
        - out_f: output file stream.
        - max_objects_per_query: cut-off for number of objects per query.
    """
    for subject_id, object_assesments in data.items():
        if not object_assesments:
            logging.warning('Received empty ranking for %s; ignoring.',
                            subject_id)

            continue

        # Probe types, to make sure everything goes alright.
        # assert isinstance(object_assesments[0][0], float) or \
        #     isinstance(object_assesments[0][0], np.float32)
        assert isinstance(object_assesments[0][1], str) or \
            isinstance(object_assesments[0][1], bytes)

        if not skip_sorting:
            object_assesments = sorted(object_assesments, reverse=True)

        if max_objects_per_query < sys.maxsize:
            object_assesments = object_assesments[:max_objects_per_query]

        if isinstance(subject_id, bytes):
            subject_id = subject_id.decode('utf8')

        for rank, (relevance, object_id) in enumerate(object_assesments):
            if isinstance(object_id, bytes):
                object_id = object_id.decode('utf8')

            out_f.write(
                '{subject} Q0 {object} {rank} {relevance} '
                '{model_name}\n'.format(
                    subject=subject_id,
                    object=object_id,
                    rank=rank + 1,
                    relevance=relevance,
                    model_name=model_name))
            
# The following writes the run to standard output.
# In your code, you should write the runs to local
# storage in order to pass them to trec_eval.
write_run(
    model_name="PLM",
    data=PLM_scores,
    out_f=open("results/PLM_scores.run", "w"),
    max_objects_per_query=1000)

NameError: name 'PLM_scores' is not defined

In [2]:
import subprocess
import re
r = re.compile(r'([^ \\t]*)\\t*')

def create_output(type_set, filename):
    if type_set == 'test':
        command = "trec_eval -m all_trec -q ap_88_89/qrel_test "
    else:
        command = "trec_eval -m all_trec -q ap_88_89/qrel_validation "
    command +=  "results/" + filename #+" | grep -E '\sall\s'"
    
    output = str(subprocess.check_output(command, shell = True))
    return output

In [90]:
def analyse_output(output, title):
    # NDCG@10, Mean Average Precision (MAP@1000), Precision@5 and Recall@1000.
    measure_results = {}
    measures = ["ndcg_cut_10", ["100"]], ["map_cut_1000",[]], ["P_5", ["500", "relative"]], ["recall_1000",[]]
    for measure in measures:
        measure_list = []
        measure_all = 0
        for line in output.split():
            if measure[0] in line:
                clean = True
                for restriction in measure[1]:
                    if restriction in line:
                        clean = False
                if clean:
                    if "tall" in line:
                        measure_all = r.findall(line)[-1]
                    else:
                        measure_list.append(float(r.findall(line)[-1]))
        measure_results[measure[0]] = measure_all, measure_list

    return [title, measure_results]

In [6]:
outd2v = create_output('test', 'tfidf_doc2vec.run')
resd2v = analyse_output(outd2v, "TF-IDF Doc2Vec")

In [7]:
print(resd2v[0])
for key,value in resd2v[1].items():
    print(str(key)+':', value[0])
print('')

TF-IDF Doc2Vec
ndcg_cut_10: 0.0586
P_5: 0.0025
map_cut_1000: 0.0233
recall_1000: 0.3279



In [8]:
outdir = create_output('test', 'dirichlet_scores_test.run')
resdir = analyse_output(outdir, "Dirichlet")

In [9]:
print(resdir[0])
for key,value in resdir[1].items():
    print(str(key)+':', value[0])
print('')

Dirichlet
ndcg_cut_10: 0.4139
P_5: 0.0179
map_cut_1000: 0.2045
recall_1000: 0.5511



In [12]:
outw2v = create_output('test', 'test.run')
resw2v = analyse_output(outw2v, "Word to Vec")

In [13]:
print(resw2v[0])
for key,value in resw2v[1].items():
    print(str(key)+':', value[0])
print('')

Word to Vec
ndcg_cut_10: 0.0281
P_5: 0.0027
map_cut_1000: 0.0271
recall_1000: 0.3338



In [91]:
outd08t = create_output('test', 'AD_scores_test.run')
resd08t = analyse_output(outd08t, "AD delta=0.8")
print(resd08t[0])
ad = [[],[],[],[]]
for key,value in resd08t[1].items():
    ind = metrics.index(key)
    ad[ind] = value[1]

outl02t = create_output('test', 'jelinek_scores_test.run')
resl02t = analyse_output(outl02t, "Jelinek lamb=0.2")
print(resl02t[0])
jelinek = [[],[],[],[]]
for key,value in resl02t[1].items():
    ind = metrics.index(key)
    jelinek[ind] = value[1]

output2000t = create_output('test', 'dirichlet_scores_test.run')
measure_results2000t = analyse_output(output2000t, "Dirichlet mu=2000")
print(measure_results2000t[0])
dirichlet = [[],[],[],[]]
for key,value in measure_results2000t[1].items():
    ind = metrics.index(key)
    dirichlet[ind] = value[1]

AD delta=0.8
Jelinek lamb=0.2
Dirichlet mu=2000


In [92]:
outtf = create_output('test', 'TFIDFrun.run')
restf = analyse_output(outtf, "TFidf")
print(restf[0])
tfidf = [[],[],[],[]]
for key,value in restf[1].items():
    ind = metrics.index(key)
    tfidf[ind] = value[1]

outbm = create_output('test', 'BM25run.run')
resbm = analyse_output(outbm, "BM25")
print(resbm[0])
bm25 = [[],[],[],[]]
for key,value in resbm[1].items():
    ind = metrics.index(key)
    bm25[ind] = value[1]

TFidf
BM25


In [94]:
outw2v = create_output('test', 'test.run')
resw2v = analyse_output(outw2v, "Word to Vec")
print(resw2v[0])
w2v = [[],[],[],[]]
for key,value in resw2v[1].items():
    ind = metrics.index(key)
    w2v[ind] = value[1]

outd2v = create_output('test', 'tfidf_doc2vec.run')
resd2v = analyse_output(outd2v, "TF-IDF Doc2Vec")
print(resd2v[0])
d2v = [[],[],[],[]]
for key,value in resd2v[1].items():
    ind = metrics.index(key)
    d2v[ind] = value[1]

Word to Vec
TF-IDF Doc2Vec


In [122]:
outlsi = create_output('test', 'tfidf_lsi_2.run')
reslsi = analyse_output(outlsi, "TF-IDF LSI")
print(reslsi[0])
lsi = [[],[],[],[]]
for key,value in reslsi[1].items():
    ind = metrics.index(key)
    lsi[ind] = value[1]
    print(key, value[0])

TF-IDF LSI
ndcg_cut_10 0.1457
P_5 0.0034
map_cut_1000 0.0759
recall_1000 0.4588


In [109]:
outplm = create_output('test', 'PLM_scores.run')
resplm = analyse_output(outplm, "PLM")
print(resplm[0])
plm = [[],[],[],[]]
for key,value in resplm[1].items():
    ind = metrics.index(key)
    plm[ind] = value[1]
    print(key, value[0])

PLM
ndcg_cut_10 0.3878
P_5 0.0178
map_cut_1000 0.1903
recall_1000 0.5298


In [123]:
results = [tfidf, bm25, jelinek, dirichlet, plm, ad, w2v, lsi, d2v]
print(len(results[0][3]))

120


In [20]:
alpha = 0.05
from scipy import stats

In [23]:
metrics = ["ndcg_cut_10", "map_cut_1000", "P_5","recall_1000"]

In [124]:
import numpy as np
methods = ['tf-idf','bm25','jelinek','dirichlet','plm','ad','w2v','lsi','d2v']
# results = [[method1:[prec],[map],[rec],[dcg]],
#            [method2:[prec],[map],[rec],[dcg]],
#            [method3:[prec],[map],[rec],[dcg]],
#            [method4:[prec],[map],[rec],[dcg]],
#            [method5:[prec],[map],[rec],[dcg]],
#            [method6:[prec],[map],[rec],[dcg]],
#            [method7:[prec],[map],[rec],[dcg]],
#            [method8:[prec],[map],[rec],[dcg]]]
           
# for metric in range(len(metrics)):
#     for method1 in range(len(methods)):
#         for method2 in range(len(methods)):
#             if method1 != method2:
#                 list_a = results[method1][metric]
#                 list_b = results[method2][metric]
#                 pvalue = scipy.stats.ttest_ind(list_a, list_b)[-1]
#                 if pvalue < alpha:    
#                     print('significant:', pvalue, '\t',methods[method1],'and', methods[method2], 'for', metrics[metric])
#                 else:
#                     print('NOT sign:', round(pvalue,3), '\t\t', methods[method1], 'and', methods[method2], 'for', metrics[metric])
            
for method1 in range(len(methods)):
    for method2 in range(len(methods)):
        if method1 != method2:
            pvalues = []
            for metric in range(len(metrics)):
                
                list_a = results[method1][metric]
                list_b = results[method2][metric]
                print(metrics[metric], methods[method1],np.mean(list_a))
                print(metrics[metric], methods[method2],np.mean(list_b))
                pvalue = scipy.stats.ttest_ind(list_a, list_b)[-1]
                pvalues.append(pvalue)
            
            product = 1
            for p in pvalues:
                product *= (1-p)
            new_p = 1-product
            if new_p < alpha:
                print('significant:', round(new_p,6), '\t',methods[method1],'and', methods[method2])
            else:
                print('NOT sign:', round(new_p,3), '\t\t', methods[method1], 'and', methods[method2])
            print('\n\n')
            

ndcg_cut_10 tf-idf 0.440425210084
ndcg_cut_10 bm25 0.430823529412
map_cut_1000 tf-idf 0.206748333333
map_cut_1000 bm25 0.208953333333
P_5 tf-idf 0.012825
P_5 bm25 0.0112641666667
recall_1000 tf-idf 0.56226
recall_1000 bm25 0.556365
NOT sign: 1.0 		 tf-idf and bm25



ndcg_cut_10 tf-idf 0.440425210084
ndcg_cut_10 jelinek 0.377674789916
map_cut_1000 tf-idf 0.206748333333
map_cut_1000 jelinek 0.187563333333
P_5 tf-idf 0.012825
P_5 jelinek 0.012275
recall_1000 tf-idf 0.56226
recall_1000 jelinek 0.538071666667
NOT sign: 0.987 		 tf-idf and jelinek



ndcg_cut_10 tf-idf 0.440425210084
ndcg_cut_10 dirichlet 0.414692436975
map_cut_1000 tf-idf 0.206748333333
map_cut_1000 dirichlet 0.204464166667
P_5 tf-idf 0.012825
P_5 dirichlet 0.0178525
recall_1000 tf-idf 0.56226
recall_1000 dirichlet 0.551121666667
NOT sign: 0.997 		 tf-idf and dirichlet



ndcg_cut_10 tf-idf 0.440425210084
ndcg_cut_10 plm 0.388782352941
map_cut_1000 tf-idf 0.206748333333
map_cut_1000 plm 0.190301666667
P_5 tf-idf 0.012825
P