In [1]:
%load_ext autoreload
%autoreload 2
import json
from scipy import sparse
from time import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import string
import os
import scipy.stats as stats

np.random.seed(12345678) # fix random seed to get same numbers

# 1. Read qrels - map qid to list of relevant docs

In [18]:
def loadQidRelDocs(inputFile, progress_every=1000000):
    """Read a qrels file and map each query id to its relevant document ids.

    Every non-blank line is split on whitespace: field 0 is the query id
    (parsed as int), field 2 the document id and field 3 the relevance label
    (parsed as int). Only documents whose label equals 1 are kept.

    Parameters
    ----------
    inputFile : str
        Path of the qrels file.
    progress_every : int, optional
        Print a progress message every `progress_every` lines; a falsy value
        disables the messages.

    Returns
    -------
    collections.defaultdict(list)
        queryId -> list of relevant document ids, in file order.
    """
    # The notebook imports the *function* `time` (`from time import time`),
    # so `time.ctime()` would raise AttributeError; import ctime explicitly.
    from time import ctime

    qidRelDocs = defaultdict(list)

    with open(inputFile) as f:
        for i, line in enumerate(f, start=1):
            if progress_every and i % progress_every == 0:
                print("Processed lines: ", i, ctime())
            resultLine = line.split()
            if not resultLine:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            queryId = int(resultLine[0])
            current_docid = resultLine[2]
            relevance = int(resultLine[3])

            if relevance == 1:
                qidRelDocs[queryId].append(current_docid)

    return qidRelDocs

In [19]:
# Parse the relevance judgments once; relDocAll maps each query id to the list
# of document ids whose relevance label is 1 (see loadQidRelDocs).
relDocAll = loadQidRelDocs("AH-MONO-FR-CLEF-2005-2006.qrel.sorted")

In [21]:
print(len(relDocAll))  # number of queries that have at least one relevant document
relDocAll.keys()  # not displayed: only the last expression of a cell is echoed
relDocAll[300]  # relevant document ids for query 300

99


['ATS.940102.0014',
 'ATS.940405.0090',
 'ATS.940420.0125',
 'ATS.940501.0060',
 'ATS.940912.0113',
 'ATS.941121.0142',
 'ATS.941203.0042',
 'ATS.941215.0159',
 'ATS.941216.0154',
 'ATS.941218.0057',
 'ATS.941219.0101',
 'ATS.950104.0099',
 'ATS.950214.0056',
 'ATS.950215.0061',
 'ATS.950305.0005',
 'ATS.950329.0145',
 'ATS.950410.0004',
 'ATS.950727.0083',
 'ATS.950918.0022',
 'ATS.950919.0083',
 'ATS.951203.0042',
 'LEMONDE94-000110-19941201',
 'LEMONDE94-000239-19940104',
 'LEMONDE95-011360']

# 2. Read the .res files and collect the rank of each relevant document (incremented by 1, because ranks in the files start from 0)

In [37]:
def getRelDocRanks(inputFile, relDocAll, progress_every=1000000):
    """Collect, per query, the 1-based ranks at which relevant documents appear.

    Every non-blank line of the result file is split on whitespace: field 0 is
    the query id (parsed as int), field 2 the document id and field 3 the rank
    (parsed as int and incremented by 1, since the files count from 0).

    Parameters
    ----------
    inputFile : str
        Path of the result (.res) file.
    relDocAll : mapping
        queryId -> iterable of relevant document ids. It is only read; no
        entries are added for queries that are missing from it.
    progress_every : int, optional
        Print a progress message every `progress_every` lines; a falsy value
        disables the messages.

    Returns
    -------
    collections.defaultdict(list)
        queryId -> list of ranks of its relevant documents, in file order.
        Queries without any retrieved relevant document have no entry.
    """
    # The notebook imports the *function* `time` (`from time import time`),
    # so `time.ctime()` would raise AttributeError; import ctime explicitly.
    from time import ctime

    qidRelDocRanks = defaultdict(list)
    # Per-query cache of relevant ids as sets: O(1) membership instead of
    # scanning the list of relevant documents for every result line.
    relDocSets = {}

    with open(inputFile) as f:
        for i, line in enumerate(f, start=1):
            if progress_every and i % progress_every == 0:
                print("Processed lines: ", i, ctime())
            resultLine = line.split()
            if not resultLine:
                # Blank line: nothing to parse.
                continue
            queryId = int(resultLine[0])
            current_docid = resultLine[2]
            rank = int(resultLine[3]) + 1

            relevant = relDocSets.get(queryId)
            if relevant is None:
                # .get() rather than [] so that a defaultdict passed as
                # relDocAll does not silently grow an empty entry for every
                # query id that has no relevance judgments.
                relevant = relDocSets[queryId] = set(relDocAll.get(queryId, ()))

            if current_docid in relevant:
                qidRelDocRanks[queryId].append(rank)

    return qidRelDocRanks

In [24]:
# Ranks of the relevant documents in a single run file (plain BM25, alpha 0),
# computed against the relevance judgments loaded into relDocAll.
qidRelDocRanks = getRelDocRanks("results-2005-2006/BM25P-CLEF-FR-bm25-plain-2005-2006-queries-p-10-alpha-0.res", relDocAll)

In [25]:
def avgRank(qidRelDocRanks, over_all_queries=True):
    """Average the ranks of relevant documents.

    Parameters
    ----------
    qidRelDocRanks : mapping
        queryId -> list of ranks of that query's relevant documents.
    over_all_queries : bool, optional
        If True (default) return one number: the mean over queries of each
        query's mean rank. If False return a dict queryId -> mean rank.

    Returns
    -------
    float or dict
        See `over_all_queries`.
    """
    per_query = {qid: np.mean(ranks) for qid, ranks in qidRelDocRanks.items()}
    if not over_all_queries:
        return per_query

    # Accumulate one query at a time, in mapping order, so the floating-point
    # result does not depend on a different summation scheme.
    total = 0
    for mean_rank in per_query.values():
        total += mean_rank
    return total / len(qidRelDocRanks)

In [26]:
# Mean rank of relevant documents for each query, then the mean of those
# per-query values.
query_rank = avgRank(qidRelDocRanks, False)
np.mean(list(query_rank.values()))

181.10730822516308

In [27]:
# NOTE: with the default over_all_queries=True, avgRank returns a single
# number, so this rebinds query_rank from a per-query dict to a scalar.
query_rank = avgRank(qidRelDocRanks)

# 3. Do it for all result files

In [29]:
# Settings
dirEval = "./results-2005-2006/"  # directory scanned for run files
ending = ".res"  # only files with this suffix are evaluated
baseTypes = ["top5-idf", "top5-tfidf", "top10-idf", "top10-tfidf"]
alphas = [1] + list(range(5, 55, 5))  # 1, then 5..50 in steps of 5

In [30]:
# Mean rank of relevant documents (averaged over queries) for every run file
# in dirEval, keyed by the (baseType, alpha) pair parsed from the file name.
metric_dict = defaultdict()

for filename in os.listdir(dirEval):
    if not filename.endswith(ending):
        continue
    # File names are "-"-separated: fields 3 and 4 form the base type and
    # field 11 (before the extension) is alpha.
    name_items = filename.split("-")
    baseType = "-".join((name_items[3], name_items[4]))
    alpha = filename.split(".")[0].split("-")[11]
    qidRelDocRanks = getRelDocRanks(dirEval + filename, relDocAll)
    metric_dict[(baseType, int(alpha))] = avgRank(qidRelDocRanks)

In [31]:
# Display the overall mean rank stored for every (baseType, alpha) key.
metric_dict

defaultdict(None,
            {('bm25-plain', 0): 181.10730822516317,
             ('bm25-plain', 1): 181.10730822516317,
             ('top10-idf', 1): 231.565525974754,
             ('top10-idf', 5): 213.30380311906535,
             ('top10-idf', 10): 203.22338505741868,
             ('top10-idf', 15): 189.87767060309719,
             ('top10-idf', 20): 189.62952002913849,
             ('top10-idf', 25): 190.97021780113596,
             ('top10-idf', 30): 192.36397143925387,
             ('top10-idf', 35): 194.09495704284797,
             ('top10-idf', 40): 194.97989924272457,
             ('top10-idf', 45): 194.20967723473458,
             ('top10-idf', 50): 195.47581510191137,
             ('top10-tfidf', 1): 230.65917409766618,
             ('top10-tfidf', 5): 206.9306949494688,
             ('top10-tfidf', 10): 194.97061013629002,
             ('top10-tfidf', 15): 188.01568557451313,
             ('top10-tfidf', 20): 190.61725011532084,
             ('top10-tfidf', 25): 186.50774

In [32]:
# Print every run whose mean rank of relevant documents is lower (i.e. better)
# than that of the plain BM25 run.
for k, v in metric_dict.items():
    if metric_dict[("bm25-plain", 0)] > v:
        print(k, v)

In [34]:
# For every run file in dirEval keep the mean rank of relevant documents per
# query (dict queryId -> mean rank), keyed by (baseType, alpha).
metric_dict_per_query = defaultdict()

for filename in os.listdir(dirEval):
    if not filename.endswith(ending):
        continue
    # File names are "-"-separated: fields 3 and 4 form the base type and
    # field 11 (before the extension) is alpha.
    name_items = filename.split("-")
    baseType = "-".join((name_items[3], name_items[4]))
    alpha = filename.split(".")[0].split("-")[11]
    qidRelDocRanks = getRelDocRanks(dirEval + filename, relDocAll)
    metric_dict_per_query[(baseType, int(alpha))] = avgRank(qidRelDocRanks, False)

# 4. Check whether the differences are statistically significant

In [39]:
# Compare BM25 with ('top5-tfidf', 15).
# The per-query values come from dictionaries keyed by query id; both samples
# are sorted by query id so that position i refers to the same query in each.

a_metric_dict = metric_dict_per_query[('top5-tfidf', 15)]
a_metric_list = sorted(list(a_metric_dict.items()), key=lambda x: x[0])


b_metric_dict = metric_dict_per_query[('bm25-plain', 0)]
b_metric_list = sorted(list(b_metric_dict.items()), key=lambda x: x[0])


# Only queries present in both runs can be paired.
intersection = a_metric_dict.keys() & b_metric_dict.keys()
# print(len(intersection))
a = [v for k,v in a_metric_list if k in intersection]
print(np.mean(a))
b = [v for k,v in b_metric_list if k in intersection]
print(np.mean(b))


# Both tests are printed; a and b are aligned per query, so the paired test
# (ttest_rel) is the one that matches how the samples were built.
print(stats.ttest_ind(a, b))
print(stats.ttest_rel(a, b))

182.278155003
181.107308225
Ttest_indResult(statistic=0.050689978014932829, pvalue=0.95962876866641234)
Ttest_relResult(statistic=0.12528729197625801, pvalue=0.90057554805061968)


In [41]:
# Compare every run in metric_dict_per_query against the plain BM25 baseline
# with a paired t-test over the queries that both runs have values for.

baseline_metric_dict = metric_dict_per_query[('bm25-plain', 0)]
# Build the sorted baseline from baseline_metric_dict itself, not from a
# variable left over from an earlier cell, so this cell is self-contained.
baseline_metric_list = sorted(list(baseline_metric_dict.items()), key=lambda x: x[0])

for k in metric_dict_per_query.keys():
    print("\x1b[0m "+str(k), end="  ")
    a_metric_dict = metric_dict_per_query[k]
    # Sort by query id so that a[i] and b[i] refer to the same query.
    a_metric_list = sorted(list(a_metric_dict.items()), key=lambda x: x[0])

    # Only queries present in both runs can be paired.
    intersection = a_metric_dict.keys() & baseline_metric_dict.keys()
    a = [rank for qid, rank in a_metric_list if qid in intersection]
    b = [rank for qid, rank in baseline_metric_list if qid in intersection]

    print("\x1b[0m "+str(np.mean(a))+ " "+ str(np.mean(b)), end="  ")

#     result = stats.ttest_ind(a, b, equal_var=False)
    result = stats.ttest_rel(a, b)
    print("\x1b[0m "+str(result), end=" ")
    # result[1] is the p-value; highlight in red when below 0.05.
    print("\x1b[31m Statistically diff" if result[1]<0.05 else "\x1b[0m ")

[0m ('top10-idf', 25)  [0m 185.969890524 181.107308225  [0m Ttest_relResult(statistic=0.44138113149119562, pvalue=0.65999569843657691) [0m 
[0m ('top5-idf', 35)  [0m 192.368859415 181.107308225  [0m Ttest_relResult(statistic=0.95136622757495615, pvalue=0.34396646716069867) [0m 
[0m ('top5-idf', 50)  [0m 196.822805143 181.107308225  [0m Ttest_relResult(statistic=1.2427890355035986, pvalue=0.21717374688356383) [0m 
[0m ('top5-tfidf', 30)  [0m 182.468641654 181.107308225  [0m Ttest_relResult(statistic=0.12486730508768772, pvalue=0.90090708724020052) [0m 
[0m ('top10-tfidf', 30)  [0m 181.634667991 181.107308225  [0m Ttest_relResult(statistic=0.052452113181520682, pvalue=0.95828469832226415) [0m 
[0m ('top10-idf', 45)  [0m 189.244948413 181.107308225  [0m Ttest_relResult(statistic=0.69886286454887703, pvalue=0.48643982286613929) [0m 
[0m ('top10-idf', 35)  [0m 189.12896756 181.107308225  [0m Ttest_relResult(statistic=0.67416296425205069, pvalue=0.50193564309211136