In [97]:
# Questions we aim to answer in the analysis
# 1) Behavior on top relevant documents [How many of the top documents for this system were relevant and could they be categorized and distinguished from others?]
# 2) Behavior on top non-relevant documents [Why were the top non-relevant documents retrieved?] Behavior on unretrieved relevant documents [Why weren’t these relevant documents retrieved within the top 1000?]
# x) Beadplot observations [How does the ranking (especially among the top 50 documents) of this system compare to all other systems?]
# 3) Base Query observations [What did the system think were the important terms of the original query, and were they good?]
# 4) Expanded Query observations [If the system expanded the query (4 out of 6 systems did), what were the important terms of the expansion, and were they helpful?]
# 5) Blunders of system [What obvious mistakes did the system make that it could have easily avoided? Examples might be bad stemming of words or bad handling of hyphenation] Other features of note [Anything else.]
# 6) What should the system do to improve performance? [The individual’s conclusion as to why the system did not retrieve well, and recommendations as to what would have made a better retrieval.]
# 7) What added information would help performance? How can system get that information? [Is there implicit information in the query, that a human would understand but the system didn’t? Examples might be world knowledge (like Germany is part of Europe).]

In [98]:
import pandas as pd

def read_results_tsv(loc):
    """Read a headerless, tab-separated model output file into a DataFrame.

    The columns are named, in file order, `query_id`, `doc_id`, `rank` and `score`.
    """
    column_names = ['query_id', 'doc_id', 'rank', 'score']
    return pd.read_csv(loc, names=column_names, header=None, sep='\t')

def read_results_csv(loc):
    """Read a headerless, space-separated model output file into a DataFrame.

    The columns are named, in file order, `query_id`, `doc_id`, `rank` and `score`.
    """
    column_names = ['query_id', 'doc_id', 'rank', 'score']
    return pd.read_csv(loc, names=column_names, header=None, sep=' ')

def get_ranking_by_query_id(d, query_id):
    """Return the doc ids retrieved for `query_id`, ordered by ascending `rank`.

    `d` is a results frame with `query_id`, `doc_id` and `rank` columns.
    An unknown `query_id` yields an empty list.
    """
    rows_for_query = d.loc[d.query_id == query_id]
    by_rank = rows_for_query[['doc_id', 'rank']].sort_values(by=['rank'])
    return by_rank['doc_id'].tolist()

# BM25 run on the MS MARCO test queries; `d` is the results frame the analysis cells below read.
d = read_results_csv("data/bm25-msmarco-test.csv")

In [99]:
import numpy as np

def read_qrels(loc):
    """Read a headerless, space-separated qrels file into a DataFrame.

    Each line is parsed as `query_id Q0 doc_id rating`; the `Q0` column is
    dropped, so the result has the columns `query_id`, `doc_id` and `rating`.
    """
    column_names = ['query_id', 'Q0', 'doc_id', 'rating']
    judgements = pd.read_csv(loc, sep=' ', header=None, names=column_names)
    return judgements.drop(columns=['Q0'])

def get_relevant_doc_ids(qrels, query_id, min_rating=2):
    """Return the judged documents of `query_id` that count as relevant.

    Parameters
    ----------
    qrels : DataFrame with `query_id`, `doc_id` and `rating` columns.
    query_id : id of the query to look up.
    min_rating : lowest rating that is treated as relevant. Defaults to 2,
        the threshold this notebook uses everywhere.

    Returns
    -------
    DataFrame with the columns `doc_id` and `rating`, one row per relevant
    document; empty when the query has no relevant judgements.
    """
    is_relevant = (qrels.query_id == query_id) & (qrels.rating >= min_rating)
    return qrels.loc[is_relevant, ['doc_id', 'rating']]

def get_recall_per_query(qrels, results, n):
    """Count, for every query in `qrels`, the relevant documents in its top `n`.

    Note that despite the name, the values are hit counts, not a ratio: each
    value is the number of top-`n` retrieved documents that
    `get_relevant_doc_ids` considers relevant.

    Parameters
    ----------
    qrels : DataFrame with `query_id`, `doc_id` and `rating` columns.
    results : results frame with `query_id`, `doc_id` and `rank` columns.
    n : number of top-ranked documents to inspect per query.

    Returns
    -------
    dict mapping each query id in `qrels` to its hit count. Queries with
    fewer than `n` retrieved documents (or none at all) are scored on the
    documents they do have instead of raising IndexError.
    """
    recalls = {}
    for query_id in qrels.query_id.unique():
        # Slicing tolerates rankings shorter than n.
        top_n = get_ranking_by_query_id(results, query_id)[:n]
        relevant_ids = set(get_relevant_doc_ids(qrels, query_id).doc_id.tolist())
        recalls[query_id] = sum(1 for doc_id in top_n if doc_id in relevant_ids)
    return recalls

def get_relevance_vector(qrels, results):
    """Count the retrieved documents per relevance rating.

    Parameters
    ----------
    qrels : DataFrame with `query_id`, `doc_id` and integer `rating` columns.
    results : results frame with `query_id` and `doc_id` columns. The
        `doc_id`/`query_id` dtypes are assumed to match those in `qrels`.

    Returns
    -------
    numpy float array `v` of length `qrels.rating.max() + 1`, where `v[r]` is
    the number of retrieved (query, document) pairs judged with rating `r`.
    Retrieved documents without a judgement are not counted.

    The counts are taken from the `results` argument (not from a notebook
    global) and cover every judged rating, including the non-relevant ones.
    """
    v = np.zeros(qrels.rating.max() + 1)
    # Inner join keeps exactly the retrieved documents that have a judgement.
    judged = results[['query_id', 'doc_id']].merge(
        qrels[['query_id', 'doc_id', 'rating']],
        on=['query_id', 'doc_id'],
        how='inner',
    )
    # np.add.at accumulates correctly when the same rating occurs many times.
    np.add.at(v, judged['rating'].to_numpy(), 1)
    return v

def get_relevant_doc_ids_not_retrieved(qrels, query_id, ranking):
    """Return the relevant doc ids of `query_id` that are missing from `ranking`.

    Parameters
    ----------
    qrels : DataFrame with `query_id`, `doc_id` and `rating` columns.
    query_id : id of the query whose relevant documents are checked.
    ranking : iterable of retrieved doc ids for that query.

    Returns
    -------
    list of doc ids, in qrels order, that `get_relevant_doc_ids` reports as
    relevant but that do not occur in `ranking`.
    """
    # A set makes each membership test O(1) instead of scanning the ranking.
    retrieved = set(ranking)
    relevant_doc_ids = get_relevant_doc_ids(qrels, query_id).doc_id.tolist()
    return [doc_id for doc_id in relevant_doc_ids if doc_id not in retrieved]

# Relevance judgements for the passage task; `qrels` is read by the analysis cells below.
qrels = read_qrels('../anserini/collections/msmarco-passage/2019qrels-pass.txt')

# The qrels contain far fewer unique query ids than the results, i.e., many queries have no judged documents at all.
# Uncomment to compare the two sets of query ids:
# print(qrels.query_id.unique())
# print(d.query_id.unique())

In [100]:
def read_queries(loc):
    """Read a headerless, tab-separated query file into a DataFrame.

    The two columns are named `query_id` and `string` (the query text).
    """
    return pd.read_csv(loc, sep='\t', names=['query_id', 'string'], header=None)

def get_query(queries, query_id):
    """Return the text of the first query in `queries` whose id is `query_id`.

    Raises IndexError when no row matches.
    """
    matching_text = queries.loc[queries['query_id'] == query_id, 'string']
    return matching_text.tolist()[0]

# Query id -> query text table for the test queries; read by the analysis cells below.
queries = read_queries("data/msmarco-test2019-queries.tsv")

In [101]:
from pyserini.index import IndexReader

# Open the Lucene index at `index_loc`; `index` is the shared reader that
# get_doc_vec, tokenize and the later analysis cells use.
index_loc = "../anserini/indexes/msmarco-passage/lucene-index-msmarco"
index = IndexReader(index_loc)

def get_doc_vec(doc_id):
    """Return the document vector stored in the index for `doc_id`.

    `doc_id` may be given as a string or as any integer-like value (Python
    int, numpy integer, ...); it is converted to its string form before the
    lookup via `index.get_document_vector`.
    """
    # str() leaves string ids unchanged and also covers numpy integers, which
    # an exact `type(doc_id) == int` comparison would miss.
    return index.get_document_vector(str(doc_id))

def tokenize(query):
    """Run `query` through the index's analyzer (`index.analyze`) and return its output."""
    tokens = index.analyze(query)
    return tokens
        

In [102]:
# Average number of judged documents per query in the qrels (rows of any rating
# divided by the number of distinct query ids): ~215.
# So, on average, at most ~215 of the documents retrieved for a query can have a judgement at all.
print(len(qrels) / len(qrels.query_id.unique()))

215.34883720930233


### 1) Behavior on top relevant documents. How many of the top documents for this system were relevant and could they be categorized and distinguished from others?

In [103]:
from subprocess import check_output
import subprocess

# This question can be answered by computing the metrics of the official trec_eval tool.
# https://www-nlpir.nist.gov/projects/trecvid/trecvid.tools/trec_eval_video/A.README
trec_eval_cmd = ['../anserini/tools/eval/trec_eval.9.0.4/trec_eval', '-c', '-mofficial', 'data/2019qrels-pass.txt', 'data/bm25-msmarco-test.trec']
# Capture stderr as well and check the exit status, so that a failing trec_eval
# run surfaces as an error instead of silently printing an empty result.
cmd = subprocess.run(trec_eval_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
cmd_out, cmd_err = cmd.stdout, cmd.stderr
if cmd.returncode != 0:
    raise RuntimeError("trec_eval exited with status {}:\n{}".format(
        cmd.returncode, cmd_err.decode("utf-8", errors="replace")))
print(cmd_out.decode("utf-8"))

runid                 	all	bm25
num_q                 	all	43
num_ret               	all	43000
num_rel               	all	4102
num_rel_ret           	all	2814
map                   	all	0.3774
gm_map                	all	0.2465
Rprec                 	all	0.3964
bpref                 	all	0.5000
recip_rank            	all	0.8245
iprec_at_recall_0.00  	all	0.8579
iprec_at_recall_0.10  	all	0.6687
iprec_at_recall_0.20  	all	0.5796
iprec_at_recall_0.30  	all	0.5073
iprec_at_recall_0.40  	all	0.4169
iprec_at_recall_0.50  	all	0.3689
iprec_at_recall_0.60  	all	0.3138
iprec_at_recall_0.70  	all	0.2596
iprec_at_recall_0.80  	all	0.1927
iprec_at_recall_0.90  	all	0.1161
iprec_at_recall_1.00  	all	0.0340
P_5                   	all	0.6930
P_10                  	all	0.6186
P_15                  	all	0.5798
P_20                  	all	0.5453
P_30                  	all	0.4953
P_100                 	all	0.3198
P_200                 	all	0.2266
P_500                 	all	0.1176
P_1000                	al

We can see that of the 4102 relevant documents, BM25 managed to find 2814. Furthermore, the precision metrics show that relevant documents are concentrated at the top of the ranking: the precision at 10 retrieved documents is about 62%, whereas the precision at 100 retrieved documents is only about 32%. This means that BM25 cannot make a clear distinction between relevant and non-relevant documents across the retrieved documents.

### 2) Behavior on top non-relevant documents. Why were the top non-relevant documents retrieved? Behavior on unretrieved relevant documents. Why weren’t these relevant documents retrieved within the top 1000?

In [104]:
# Use the fourth distinct query id in the qrels (index 3) as a worked example.
query_id = qrels.query_id.unique()[3]
query = get_query(queries, query_id)
ranking = get_ranking_by_query_id(d, query_id)
# Relevant documents for this query that do not occur anywhere in its ranking.
relevant_doc_ids_not_retrieved = get_relevant_doc_ids_not_retrieved(qrels, query_id, ranking)
# Show the analyzed query terms next to the term vector of the first missed document.
print(tokenize(query))
# NOTE(review): indexing [0] raises IndexError when every relevant document was retrieved.
print(get_doc_vec(relevant_doc_ids_not_retrieved[0]))

['caus', 'militari', 'suicid']
{'half': 2, 'expos': 1, 'about': 2, 'american': 1, 'had': 1, 'injuri': 1, 'while': 1, 'riski': 1, 'when': 1, 'drug': 1, 'problem': 2, 'white': 1, 'than': 3, '52': 1, 'alcohol': 1, 'all': 2, 'trauma': 1, 'which': 1, 'like': 2, 'onli': 1, 'vietnam': 2, 'them': 1, 'develop': 1, 'seriou': 1, '1': 1, '2': 2, 'symptom': 2, '3': 1, 'million': 2, 'veteran': 3, '6': 1, '1.7': 2, '60': 1, 'women': 2, '27': 1, '28': 1, 'affect': 1, 'medic': 1, 'combat': 1, 'experienc': 2, 'seen': 1, 'abus': 2, 'men': 2, 'have': 5, 'behavior': 1, 'ptsd': 6, '35': 1, 'more': 4, 'health': 1, 'war': 1, 'also': 1, 'although': 1, 'due': 1, 'african': 1}


In this case, synonyms and similar terms for military such as 'veteran' and 'medic' could have helped in finding this document.

### 3) Base Query observations. What did the system think were the important terms of the original query, and were they good?

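BM25 has no explicit query-term weighting beyond its built-in IDF component. The analyzer only removes unimportant words (stop words), so every remaining query term is treated as important.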

### 4) Expanded Query observations. If the system expanded the query (4 out of 6 systems did), what were the important terms of the expansion, and were they helpful?

BM25 does not expand the query.

### 5) Blunders of system. What obvious mistakes did the system make that it could have easily avoided? Examples might be bad stemming of words or bad handling of hyphenation. Other features of note. Anything else.

We can answer this question by looking at the queries with the worst recall.

In [105]:
# get_recall_per_query yields, per query, the number of relevant documents among the top 20.
# Sorting ascending puts the queries with the fewest relevant hits first.
recalls = [(k, v) for k, v in get_recall_per_query(qrels, d, 20).items()]
recalls.sort(key=lambda x: x[1])
worst_query_id = recalls[0][0]

# Query text and its analyzed terms for the lowest-scoring query.
print(get_query(queries, worst_query_id))
print(tokenize(get_query(queries, worst_query_id)))

# Term vector of the first relevant document that is missing from the ranking.
# NOTE(review): indexing [0] raises IndexError when no relevant document is missing.
relevant = get_relevant_doc_ids_not_retrieved(qrels, worst_query_id, get_ranking_by_query_id(d, worst_query_id))
print(get_doc_vec(relevant[0]))

# NOTE: index 2 selects the query with the third-lowest count (index 1 is skipped);
# the name worst_query_id is reused for it.
worst_query_id = recalls[2][0]

print(get_query(queries, worst_query_id))
print(tokenize(get_query(queries, worst_query_id)))

# For this query the raw stored document is printed instead of its term vector.
relevant = get_relevant_doc_ids_not_retrieved(qrels, worst_query_id, get_ranking_by_query_id(d, worst_query_id))
print(index.doc(str(relevant[0])).raw())


what are the three percenters?
['what', 'three', 'percent']
{'militia': 1, 'refus': 1, 'govern': 1, 'subject': 1, 'own': 1, '09': 1, 'american': 1, 'british': 1, 'three': 1, 'second': 2, '3per': 1, 'loos': 1, 'bear': 1, 'from': 1, 'up': 1, 'took': 1, 'law': 1, 'king': 1, 'like': 1, 'amend': 2, 'revolutionari': 1, 'patriot': 1, 'onli': 1, 'come': 1, 'carri': 1, 'dure': 1, 'disarm': 1, '3': 3, 'constitut': 1, 'keep': 1, 'firearm': 1, 'name': 1, 'octob': 1, 'who': 1, 'fact': 1, 'against': 1, 'percent': 4, 'revolut': 1, 'we': 3, 'coloni': 1, 'violat': 1, 'arm': 1, 'win': 1, 'vow': 1, 'independ': 1, 'mind': 1, 'war': 1, 'enough': 1, 'right': 1, 'compli': 2, 'affili': 1, 'popul': 1, '2013': 1, 'overthrow': 1}
why did the us volunterilay enter ww1
['why', 'did', 'us', 'volunterilai', 'enter', 'ww1']
{
  "id" : "1048074",
  "contents" : "Woodrow Wilson also embarked on reorganising the federal banking system. From 1914 to 1917, he observed a strict neutrality in the Great War but the activitie

In this case, the problem with the worst-performing query is that "percenters" is stemmed to "percent", which now matches any document containing the word "percent". This is an obvious stemming issue, which could be resolved by using NER information.

In the second query, a spelling mistake prevents good retrieval, and again synonyms for WW1 might be useful.

### 6) What should the system do to improve performance? The individual’s conclusion as to why the system did not retrieve well, and recommendations as to what would have made a better retrieval.

### 7) What added information would help performance? How can system get that information? Is there implicit information in the query, that a human would understand but the system didn’t? Examples might be world knowledge (like Germany is part of Europe).

In general, similar terms and synonyms could greatly benefit retrieval, as queries are often very short and may miss key terms. Also, a spell checker may benefit search, as some queries contain spelling errors, which prevent the misspelled term from matching the intended word.