In [11]:
%matplotlib inline

from glob import glob
from os import chdir, makedirs
from os.path import basename, join, splitext

import numpy as np
import pandas as pd

# Root directory (relative to the working directory) under which the run
# files are globbed; the grain, operator, query formulation, corpus and
# top-k sub-directories are appended to it further down.
INPUT_BASE_DIR = 'anserini_runs'
# Directory (relative to the working directory) that receives every TSV
# file written by this notebook.
OUTPUT_DIR = join('analyses', 'single')

In [12]:
# This notebook should be executed once for each combination of collection,
# query formulation strategy and number of hits (top-k) #
corpus_suffix = ['eRisk18', 'eRisk19', 'clpsych15', 'clpsych15_ctrl']
bdi_suffix = ['bdi_optext']
hits_suffix = ['10', '1000']

# -- Anserini result file -- #
# Query Result File Format:
# * 1st column is the topic number (i.e., query ID).
# * 2nd column is currently unused and should always be "Q0".
# * 3rd column is the official document identifier of the retrieved document.
# * 4th column is the rank at which the document is retrieved.
# * 5th column shows the score (integer or floating point) that generated the ranking.
# * 6th column is called the "run tag" and should be a unique identifier for your system.
# Columns are separated by a single **space**, e.g.:
# 1 Q0 413516 1 5.086600 Anserini #
trec_eval_fmt_cols = ['qid', 'unused', 'docid', 'rank', 'score', 'runtag']

# Retrieval models used #
retrieval_models = ['bm25', 'qld', 'bm25+rm3', 'qld+rm3', 'bm25+bm25prf']

operator = 'should'
grain = 'single'

# Select the corresponding index based on the desired analysis #
# The indices are zero-based and must stay within the lists defined above:
# bdi_suffix has a single entry (index 0) and hits_suffix has two (0 and 1).
# The previous values (1 and 2) were out of range and raised IndexError on a
# fresh kernel; 0 and 1 select 'bdi_optext' and '1000'.
corpus = corpus_suffix[0]
bdi = bdi_suffix[0]
hits = hits_suffix[1]

# Glob pattern matching every run file of the selected combination #
path = join(INPUT_BASE_DIR, grain, operator, bdi, corpus, hits, '*')
print('Listing: %s' % path)

# Sorted so that the files are always processed in the same order #
ls = sorted(glob(path))

Listing: anserini_runs/single/should/bdi_optext/eRisk18/1000/*


In [13]:
# Exclusive upper bound of the query ids #
# There is one query per BDI item (21 in total), numbered from 1, so the
# per-run arrays below have 22 slots and slot 0 is never filled #
max_qid = 22
retrieval_prf_mean = {}
result_set_size = {}

# **Incidence Score**: the average relevance score of the documents retrieved
# for a BDI item by a given ranking function #
# Each entry of _retrieval_prf_mean_ maps a ranking function to the array of
# incidence scores, indexed by item number; _result_set_size_ holds the number
# of retrieved documents in the same layout #
for filename in ls:
    print('Processing: %s' % filename)
    # File names look like <model>_<operator>_<hits>_<bdi>.<ext>; only the
    # model name is used afterwards #
    run_name = splitext(basename(filename))[0]
    rkg_mdl, _operator, _hits, _bdi = run_name.split('_')
    # Read the space-separated result file and keep the useful columns #
    ranking = pd.read_csv(filename, header=None, sep=' ')
    ranking.columns = trec_eval_fmt_cols
    ranking = ranking.drop(columns=['unused', 'runtag'])
    mean_scores = np.zeros(max_qid, dtype=np.float64)
    set_sizes = np.zeros(max_qid, dtype=np.int32)
    for qid in range(1, max_qid):
        hits_for_query = ranking.loc[ranking.qid == qid]
        # The mean is NaN when the query retrieved no documents #
        mean_scores[qid] = hits_for_query['score'].mean()
        set_sizes[qid] = len(hits_for_query)
    retrieval_prf_mean[rkg_mdl] = mean_scores
    result_set_size[rkg_mdl] = set_sizes

Processing: anserini_runs/single/should/bdi_optext/eRisk18/1000/bm25+bm25prf_OR_1000_optext-bdi.txt
Processing: anserini_runs/single/should/bdi_optext/eRisk18/1000/bm25+rm3_OR_1000_optext-bdi.txt


Processing: anserini_runs/single/should/bdi_optext/eRisk18/1000/bm25_OR_1000_optext-bdi.txt
Processing: anserini_runs/single/should/bdi_optext/eRisk18/1000/qld+rm3_OR_1000_optext-bdi.txt
Processing: anserini_runs/single/should/bdi_optext/eRisk18/1000/qld_OR_1000_optext-bdi.txt


In [14]:
# Persist the incidence scores and the result-set sizes as TSV files, then
# load them back as DataFrames indexed by BDI item number #

# to_csv does not create missing directories, so make sure the target exists #
makedirs(OUTPUT_DIR, exist_ok=True)

# NOTE(review): the file names encode operator, hits and bdi but not the
# corpus, so runs for different collections appear to overwrite each other -
# confirm whether that is intended #
filepath = [join(OUTPUT_DIR, 'retrieval_prf_mean_%s_%s_%s.tsv' % (operator, hits, bdi)),
            join(OUTPUT_DIR, 'result_set_size_%s_%s_%s.tsv' % (operator, hits, bdi))]

# Row 0 is an unused placeholder (query ids start at 1) and is dropped before
# writing. errors='ignore' keeps this cell re-runnable: `result_set_size` is
# rebound below to a DataFrame that no longer contains row 0, so a second
# execution would otherwise raise KeyError #
pd.DataFrame.from_dict(retrieval_prf_mean).drop(index=0, errors='ignore').to_csv(
    filepath[0], index=True, header=True, sep='\t')
pd.DataFrame.from_dict(result_set_size).drop(index=0, errors='ignore').to_csv(
    filepath[1], index=True, header=True, sep='\t')

# Reload both tables; the first column of each file is the item number #
retrieval_prf = pd.read_csv(filepath[0], header=0, index_col=0, sep='\t')
result_set_size = pd.read_csv(filepath[1], header=0, index_col=0, sep='\t')

In [15]:
# Turn the incidence scores into one ordered list of the 21 BDI items per
# ranking function, and pair each position with its result-set size #
table_shape = (max_qid - 1, len(retrieval_models))
rankings = np.zeros(table_shape, dtype=np.int32)
hits_by_query = np.zeros(table_shape, dtype=np.int32)
for column, model in enumerate(retrieval_models):
    # Sort the model's column of _retrieval_prf_ by decreasing incidence
    # score; the row labels that remain are the item numbers in ranked order #
    item_order = retrieval_prf[model].sort_values(ascending=False).index
    rankings[:, column] = item_order.to_numpy()
    # Result-set sizes, looked up in the same item order #
    hits_by_query[:, column] = result_set_size.loc[item_order, model].to_numpy()

questions_ranking = pd.DataFrame(rankings, columns=retrieval_models)
paired_result_set_size = pd.DataFrame(hits_by_query, columns=retrieval_models)

ranking_file = join(OUTPUT_DIR, 'ranking.tsv')
hits_file = join(OUTPUT_DIR, 'hits.tsv')
questions_ranking.to_csv(ranking_file, header=True, sep='\t')
paired_result_set_size.to_csv(hits_file, header=True, sep='\t')

# The BDI items are now sorted by incidence score: row zero holds the number
# of the item with the highest incidence and row twenty the number of the
# item with the lowest incidence #