In [2]:
%load_ext autoreload
%autoreload 2
import json
from scipy import sparse
from time import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import string
import os
import scipy.stats as stats

np.random.seed(12345678) # fix random seed to get same numbers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Read qrels - map qid to list of relevant docs

In [53]:
def loadQidRelDocs(inputFile):
    """Read a qrels file and map each query id to its relevant document ids.

    Each non-blank line is split on whitespace; field 0 is the query id (int),
    field 2 the document id and field 3 the relevance judgment (int).
    Only documents whose relevance equals 1 are kept.

    Parameters
    ----------
    inputFile : str
        Path of the qrels file.

    Returns
    -------
    defaultdict(list)
        query id -> list of relevant document ids, in file order.
    """
    # The notebook does `from time import time`, so the global name `time` is
    # the function, not the module, and `time.ctime()` raises AttributeError.
    # Import ctime locally so the progress message works.
    from time import ctime

    qidRelDocs = defaultdict(list)

    with open(inputFile) as f:
        for lineNo, line in enumerate(f, start=1):
            if lineNo % 1000000 == 0:
                print("Processed lines: ", lineNo, ctime())
            resultLine = line.split()
            if not resultLine:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            queryId = int(resultLine[0])
            current_docid = resultLine[2]
            relevance = int(resultLine[3])

            if relevance == 1:
                qidRelDocs[queryId].append(current_docid)

    return qidRelDocs

In [54]:
# Load the relevance judgments: query id -> list of relevant document ids.
relDocAll = loadQidRelDocs("AH-MONO-FR-CLEF2000-2006.qrel.sorted")

In [55]:
# Sanity check: number of queries with at least one relevant document,
# followed by the relevant documents of query 1.
print(len(relDocAll))
relDocAll[1]

333


['LEMONDE94-000038-19941101',
 'LEMONDE94-000323-19941004',
 'LEMONDE94-000324-19941004',
 'LEMONDE94-000516-19941004',
 'LEMONDE94-001403-19940308',
 'LEMONDE94-001945-19940216',
 'LEMONDE94-002458-19940219',
 'LEMONDE94-002762-19940524']

# 2. Read res files and collect the ranks of relevant docs (incremented by 1, because ranks in the files start from 0)

In [82]:
def getRelDocRanks(inputFile, relDocAll):
    """Collect, per query, the ranks at which relevant documents were retrieved.

    Each non-blank line of the result file is split on whitespace; field 0 is
    the query id (int), field 2 the document id and field 3 the rank (int).
    Ranks are shifted by +1 before being stored.

    Parameters
    ----------
    inputFile : str
        Path of the result (.res) file.
    relDocAll : mapping
        query id -> collection of relevant document ids. It is only read,
        never modified.

    Returns
    -------
    defaultdict(list)
        query id -> list of (rank + 1) for every retrieved relevant document,
        in file order. Queries for which no relevant document was retrieved
        do not appear in the result.
    """
    # The notebook does `from time import time`, so the global name `time` is
    # the function, not the module, and `time.ctime()` raises AttributeError.
    # Import ctime locally so the progress message works.
    from time import ctime

    qidRelDocRanks = defaultdict(list)
    # Per-query set of relevant doc ids, built lazily: constant-time membership
    # tests instead of scanning a list for every result line.
    relDocSets = {}

    with open(inputFile) as f:
        for lineNo, line in enumerate(f, start=1):
            if lineNo % 1000000 == 0:
                print("Processed lines: ", lineNo, ctime())
            resultLine = line.split()
            if not resultLine:
                # Blank line: nothing to parse.
                continue
            queryId = int(resultLine[0])
            current_docid = resultLine[2]
            rank = int(resultLine[3]) + 1

            relDocs = relDocSets.get(queryId)
            if relDocs is None:
                # .get() instead of [] so that a defaultdict passed by the caller
                # does not silently gain an empty entry for every unjudged query.
                relDocs = relDocSets[queryId] = set(relDocAll.get(queryId, ()))

            if current_docid in relDocs:
                qidRelDocRanks[queryId].append(rank)

    return qidRelDocRanks

In [83]:
# Ranks of the relevant documents for a single run file.
qidRelDocRanks = getRelDocRanks("results/BM25P-CLEF-FR-bm25-plain-all-queries-p-10-alpha-0.res", relDocAll)

In [84]:
def avgRank(qidRelDocRanks, over_all_queries=True):
    """Average rank of the relevant documents.

    Parameters
    ----------
    qidRelDocRanks : mapping
        query id -> list of ranks of the relevant documents for that query.
    over_all_queries : bool, default True
        If True, return a single number: the mean over queries of each
        query's mean rank (every query weighs the same, regardless of how
        many relevant documents it has). If False, return the per-query
        means as a dict.

    Returns
    -------
    float or dict
        The macro-averaged rank, or ``{query id: mean rank}``.
        For an empty mapping the macro average is ``nan`` (there is nothing
        to average) instead of raising ZeroDivisionError.
    """
    avgRankPerQuery = {qid: np.mean(ranks) for qid, ranks in qidRelDocRanks.items()}

    if not over_all_queries:
        return avgRankPerQuery

    if not avgRankPerQuery:
        return float("nan")

    # Plain left-to-right sum, as before, so previously reported numbers are unchanged.
    return sum(avgRankPerQuery.values()) / len(avgRankPerQuery)

In [85]:
# Per-query average rank of the relevant documents for the run loaded above.
query_rank = avgRank(qidRelDocRanks, False)
# NOTE(review): the value of the next expression is not displayed, because only
# the last expression of a cell is echoed.
np.mean(list(query_rank.values()))
query_rank

# print(len(query_rank.keys()))

{1: 64.714285714285708,
 3: 299.33333333333331,
 5: 379.85714285714283,
 6: 75.333333333333329,
 7: 194.63157894736841,
 8: 119.0,
 9: 531.44444444444446,
 10: 179.84210526315789,
 11: 349.47058823529414,
 12: 150.5,
 13: 296.0,
 15: 519.16666666666663,
 17: 9.0,
 18: 507.0,
 19: 267.33333333333331,
 20: 406.69696969696969,
 21: 538.0,
 22: 28.0,
 24: 524.0,
 25: 499.0,
 26: 406.0,
 29: 236.33333333333334,
 30: 34.5,
 31: 306.30000000000001,
 32: 456.0,
 34: 319.5,
 35: 10.0,
 37: 147.0,
 39: 143.90909090909091,
 40: 239.33333333333334,
 41: 38.0,
 42: 217.625,
 43: 209.0,
 44: 34.0,
 45: 92.978723404255319,
 46: 471.08333333333331,
 47: 254.73809523809524,
 48: 683.75,
 49: 164.18181818181819,
 50: 139.36734693877551,
 51: 238.0,
 52: 416.0,
 53: 393.0,
 55: 138.72289156626505,
 56: 139.11363636363637,
 57: 134.49367088607596,
 58: 27.045454545454547,
 59: 116.66666666666667,
 60: 592.57142857142856,
 61: 505.69999999999999,
 62: 25.285714285714285,
 63: 36.823529411764703,
 65: 66.0,

In [60]:
# Average over all queries. NOTE: this rebinds `query_rank` from the per-query
# dict to a single number.
query_rank = avgRank(qidRelDocRanks)

# 3. Do it for all result files

In [61]:
# Settings
# dirEval = "./results-2005-2006/"
# Directory scanned for run files. It must end with "/", because it is
# concatenated directly with the file names.
dirEval = "./results/"
# NOTE(review): baseTypes and alphas are not referenced by the cells visible
# here -- confirm they are used further down, or remove them.
baseTypes = ["top5-idf", 
             "top5-tfidf",
             "top10-idf",
             "top10-tfidf"]
alphas = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# Only files in dirEval whose name ends with this suffix are evaluated.
ending = ".res"

In [62]:
# Average rank of the relevant documents (mean over queries) for every run file
# in dirEval, keyed by (baseType, alpha) as parsed from the file name.
metric_dict = defaultdict()

for filename in os.listdir(dirEval):
    if not filename.endswith(ending):
        continue

    # Run files are named like
    # "BM25P-CLEF-FR-bm25-plain-all-queries-p-10-alpha-0.res":
    # dash-separated fields 3 and 4 form the base type, and field 10 of the
    # name up to the first "." is alpha.
    dashFields = filename.split("-")
    baseType = "-".join((dashFields[3], dashFields[4]))
    alpha = filename.split(".")[0].split("-")[10]

    qidRelDocRanks = getRelDocRanks(dirEval+filename, relDocAll)
    overallAvgRank = avgRank(qidRelDocRanks)
    metric_dict[(baseType, int(alpha))] = overallAvgRank

In [63]:
# (baseType, alpha) -> average rank of relevant documents over all queries.
metric_dict

defaultdict(None,
            {('bm25-plain', 0): 178.11662152866126,
             ('bm25-plain', 1): 178.11662152866126,
             ('top10-idf', 1): 219.28860396058721,
             ('top10-idf', 5): 194.43268975804466,
             ('top10-idf', 10): 187.12584537841903,
             ('top10-idf', 15): 181.33441188617522,
             ('top10-idf', 20): 180.73190102830472,
             ('top10-idf', 25): 180.87313269077526,
             ('top10-idf', 30): 181.48083151927517,
             ('top10-idf', 35): 180.92720941234808,
             ('top10-idf', 40): 181.14192208411305,
             ('top10-idf', 45): 181.10757993144935,
             ('top10-idf', 50): 181.57057435017842,
             ('top10-tfidf', 1): 222.96976003981126,
             ('top10-tfidf', 5): 193.69095717296071,
             ('top10-tfidf', 10): 181.11295916368752,
             ('top10-tfidf', 15): 181.4455749053995,
             ('top10-tfidf', 20): 180.5465797673013,
             ('top10-tfidf', 25): 180.8653

In [64]:
# Print every configuration whose average rank is lower than that of the
# plain BM25 run ("bm25-plain", alpha 0).
for config, meanRank in metric_dict.items():
    if metric_dict[("bm25-plain",0)] > meanRank:
        print(config, meanRank)

In [65]:
# Same scan over the run files as for metric_dict, but keeping the per-query
# average ranks: (baseType, alpha) -> {query id: average rank}.
metric_dict_per_query = defaultdict()

for filename in os.listdir(dirEval):
    if not filename.endswith(ending):
        continue

    # Dash-separated fields 3 and 4 of the file name form the base type, and
    # field 10 of the name up to the first "." is alpha.
    dashFields = filename.split("-")
    baseType = "-".join((dashFields[3], dashFields[4]))
    alpha = filename.split(".")[0].split("-")[10]

    qidRelDocRanks = getRelDocRanks(dirEval+filename, relDocAll)
    perQueryAvgRank = avgRank(qidRelDocRanks, False)
    metric_dict_per_query[(baseType, int(alpha))] = perQueryAvgRank

# Check if statistically significant

In [99]:
# compare BM25 with ('top5-tfidf', 15)
# data comes from dictionary which is not ordered. we MUST order values according to queryID

a_metric_dict = metric_dict_per_query[('top5-tfidf', 15)]
a_metric_list = sorted(list(a_metric_dict.items()), key=lambda x: x[0])


b_metric_dict = metric_dict_per_query[('bm25-plain', 0)]
b_metric_list = sorted(list(b_metric_dict.items()), key=lambda x: x[0])


# Only queries present in both runs can be paired, so restrict both samples
# to the common query ids.
intersection = a_metric_dict.keys() & b_metric_dict.keys()
# print(len(intersection))
a = [v for k,v in a_metric_list if k in intersection]
print(np.mean(a))
b = [v for k,v in b_metric_list if k in intersection]
print(np.mean(b))


# a and b are aligned by query id, so the paired test (ttest_rel) is the one
# that matches the data; the independent-samples test is printed alongside it.
print(stats.ttest_ind(a, b))
print(stats.ttest_rel(a, b))

177.407985121
175.908807173
Ttest_indResult(statistic=0.11138183495296421, pvalue=0.91135032283777151)
Ttest_relResult(statistic=0.3482218773218565, pvalue=0.72791457175136765)


In [106]:
# Compare every configuration with the plain BM25 baseline using a paired
# t-test on the per-query average ranks. Values are sorted by query id and
# restricted to the queries present in both runs, so that a[i] and b[i]
# always refer to the same query.
# NOTE: many configurations are tested against the same baseline and no
# multiple-comparison correction is applied to the 0.05 threshold.

baseline_metric_dict = metric_dict_per_query[('bm25-plain', 0)]
# Built from baseline_metric_dict itself: the previous version read
# `b_metric_dict`, a variable left over from another cell, which made this
# cell depend on hidden kernel state.
baseline_metric_list = sorted(baseline_metric_dict.items(), key=lambda x: x[0])

for config in metric_dict_per_query.keys():
    print("\x1b[0m "+str(config), end="  ")
    a_metric_dict = metric_dict_per_query[config]
    a_metric_list = sorted(a_metric_dict.items(), key=lambda x: x[0])

    intersection = a_metric_dict.keys() & baseline_metric_dict.keys()
    a = [rank for qid, rank in a_metric_list if qid in intersection]
    b = [rank for qid, rank in baseline_metric_list if qid in intersection]

    print("\x1b[0m "+str(np.mean(a))+ " "+ str(np.mean(b)), end="  ")

    # Paired test: both samples hold one value per query, in the same order.
    result = stats.ttest_rel(a, b)
    print("\x1b[0m "+str(result), end=" ")
    # result[1] is the p-value; it is nan when the two samples are identical,
    # in which case the comparison is False and nothing is highlighted.
    print("\x1b[31m Statistically diff" if result[1]<0.05 else "\x1b[0m ")

[0m ('top5-tfidf', 15)  [0m 177.407985121 175.908807173  [0m Ttest_relResult(statistic=0.3482218773218565, pvalue=0.72791457175136765) [0m 
[0m ('top10-idf', 45)  [0m 177.888965964 175.908807173  [0m Ttest_relResult(statistic=0.34601266865454977, pvalue=0.7295725336617026) [0m 
[0m ('bm25-plain', 1)  [0m 178.116621529 178.116621529  [0m Ttest_relResult(statistic=nan, pvalue=nan) [0m 
[0m ('top5-tfidf', 30)  [0m 177.177935262 175.908807173  [0m Ttest_relResult(statistic=0.23343401858060753, pvalue=0.81558150630583148) [0m 
[0m ('top10-idf', 50)  [0m 178.674547213 175.908807173  [0m Ttest_relResult(statistic=0.4717835908715452, pvalue=0.63741970147675198) [0m 
[0m ('top10-tfidf', 25)  [0m 176.798704486 178.116621529  [0m Ttest_relResult(statistic=-0.27711691261717963, pvalue=0.78187805593331716) [0m 
[0m ('top5-tfidf', 35)  [0m 176.613284414 175.908807173  [0m Ttest_relResult(statistic=0.12617110666051867, pvalue=0.89967988621359951) [0m 
[0m ('top10-tfidf', 