In [None]:
# Questions we aim to answer in the analysis
# 1) Behavior on top relevant documents [How many of the top documents for this system were relevant and could they be categorized and distinguished from others?]
# 2) Behavior on top non-relevant documents [Why were the top non-relevant documents retrieved?] Behavior on unretrieved relevant documents [Why weren’t these relevant documents retrieved within the top 1000?]
# x) Beadplot observations [How does the ranking (especially among the top 50 documents) of this system compare to all other systems?]
# 3) Base Query observations [What did the system think were the important terms of the original query, and were they good?]
# 4) Expanded Query observations [If the system expanded the query (4 out of 6 systems did), what were the important terms of the expansion, and were they helpful?]
# 5) Blunders of system [What obvious mistakes did the system make that it could have easily avoided? Examples might be bad stemming of words or bad handling of hyphenation] Other features of note [Anything else.]
# 6) What should the system do to improve performance? [The individual’s conclusion as to why the system did not retrieve well, and recommendations as to what would have made a better retrieval.]
# 7) What added information would help performance? How can system get that information? [Is there implicit information in the query, that a human would understand but the system didn’t? Examples might be world knowledge (like Germany is part of Europe).]

In [None]:
import pandas as pd

def read_results_tsv(loc):
    """Load a tab-separated model output into a DataFrame.

    The input is read without a header row; its fields are labelled, in order,
    query_id, doc_id, rank and score.

    Args:
        loc: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with the columns query_id, doc_id, rank and score.
    """
    columns = ['query_id', 'doc_id', 'rank', 'score']
    return pd.read_csv(loc, sep='\t', header=None, names=columns)

def read_results_csv(loc):
    """Load a space-separated model output into a DataFrame.

    The input is read without a header row; its fields are labelled, in order,
    query_id, doc_id, rank and score.

    Args:
        loc: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with the columns query_id, doc_id, rank and score.
    """
    field_names = ['query_id', 'doc_id', 'rank', 'score']
    results = pd.read_csv(loc, names=field_names, header=None, sep=' ')
    return results

def get_ranking_by_query_id(d, query_id):
    """Return the doc ids retrieved for one query, ordered by ascending rank.

    Args:
        d: Results frame with at least the columns query_id, doc_id and rank.
        query_id: Query whose ranking is requested.

    Returns:
        List of doc ids sorted by their rank; empty if the query has no rows.
    """
    # Restrict to the rows of this query and the two columns that matter.
    rows_for_query = d.loc[d.query_id == query_id, ['doc_id', 'rank']]
    ordered = rows_for_query.sort_values(by=['rank'])
    return ordered['doc_id'].tolist()

# Load the space-separated ranking output analysed in this notebook. The frame is
# bound to the module-level name `d`, which later cells pass on as the results.
d = read_results_csv("data/bm25-msmarco-test.csv")

In [None]:
import numpy as np

def read_qrels(loc):
    """Load relevance judgments (qrels) from a space-separated file.

    The input is read without a header row; its four fields are labelled
    query_id, Q0, doc_id and rating, and the Q0 column is then discarded.

    Args:
        loc: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with the columns query_id, doc_id and rating.
    """
    field_names = ['query_id', 'Q0', 'doc_id', 'rating']
    judgments = pd.read_csv(loc, names=field_names, sep=' ', header=None)
    # The Q0 field is not used anywhere downstream, so drop it right away.
    return judgments.drop(columns=['Q0'])

def get_relevant_doc_ids(qrels, query_id):
    """Return the judged documents of one query together with their ratings.

    Args:
        qrels: Judgments frame with the columns query_id, doc_id and rating.
        query_id: Query whose judgments are requested.

    Returns:
        DataFrame with the columns doc_id and rating, holding the rows of
        ``qrels`` whose query_id equals ``query_id`` (original index kept).
    """
    belongs_to_query = qrels['query_id'] == query_id
    return qrels.loc[belongs_to_query, ['doc_id', 'rating']]

def get_relevance_vector(qrels, results):
    """Count the retrieved documents for each relevance rating.

    A retrieved (query_id, doc_id) pair contributes to the count of every
    rating it has been judged with in ``qrels``. Retrieved documents without
    a judgment, and queries that do not occur in ``qrels``, are ignored.

    Args:
        qrels: Judgments frame with the columns query_id, doc_id and rating.
            Ratings are expected to be non-negative integers.
        results: Results frame with at least the columns query_id and doc_id.
            This argument is what gets evaluated; no module-level frame is
            consulted.

    Returns:
        Float numpy array of length ``qrels.rating.max() + 1`` whose entry
        ``i`` is the number of retrieved documents judged with rating ``i``.
        An empty array is returned when ``qrels`` has no rows.

    Raises:
        ValueError: If a matched rating is negative (raised by ``np.bincount``).
    """
    if qrels.empty:
        # No judgments at all: there is no maximum rating to size the vector.
        return np.zeros(0)

    n_ratings = int(qrels.rating.max()) + 1

    # Identical judgment rows must not inflate the counts.
    judgments = qrels[['query_id', 'doc_id', 'rating']].drop_duplicates()

    # The inner join keeps exactly the retrieved pairs that carry a judgment,
    # replacing the per-query / per-document Python loops with one pass.
    judged = results[['query_id', 'doc_id']].merge(
        judgments, on=['query_id', 'doc_id'], how='inner'
    )

    ratings = judged['rating'].to_numpy(dtype=np.int64)
    # minlength keeps the slots of ratings that were never retrieved.
    return np.bincount(ratings, minlength=n_ratings).astype(float)

# Load the relevance judgments that the retrieved documents are compared against.
qrels = read_qrels('../anserini/collections/msmarco-passage/2019qrels-pass.txt')

# NOTE: qrels contains fewer unique query ids than the results, i.e., not every query
# has judged documents (by a long shot). Presumably only a subset of the queries
# was judged -- TODO confirm against the dataset documentation.
# Uncomment to compare the two sets of query ids:
# print(qrels.query_id.unique())
# print(d.query_id.unique())

In [None]:
# 1) Behavior on top relevant documents [How many of the top documents for this system were relevant and could they be categorized and distinguished from others?]
# Prints one count per rating value: entry i is the number of retrieved documents judged with rating i.
print(get_relevance_vector(qrels, d))