In [2]:
from pathlib import Path

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
ensure_pyterrier_is_loaded()

import pyterrier as pt
from ranx import Run, fuse
import pandas as pd

In [3]:
# For every dataset split: fuse the three webis-bm25r-{1,2,3} runs with ranx,
# store the fused run under ./<split>/run.txt, and hand it to TIRA's
# persist_and_normalize_run under the system name 'webis-bm25rf'.
for d in ['tot-test', 'tot-train', 'tot-dev']:
    runs = [Run.from_file(f'../webis-bm25r-{i}/{d}/run.txt') for i in ['1', '2', '3']]
    combined_run = fuse(
        runs=runs,            # A list of Run instances to fuse
        norm="min-max",       # The normalization strategy to apply before fusion
        method="rrf",         # The fusion algorithm to use
    )
    # Pure-Python replacement for the former `!mkdir -p {d}` shell escape:
    # creates the output directory if needed and is a no-op when it exists,
    # without depending on IPython or on a POSIX shell being available.
    Path(d).mkdir(parents=True, exist_ok=True)
    combined_run.save(f'{d}/run.txt')
    # Read the file just written back in TREC format to obtain the results
    # object that is passed on to persist_and_normalize_run.
    run = pt.io.read_results(f'{d}/run.txt', format='trec')
    persist_and_normalize_run(run, 'webis-bm25rf', f'{d}/run.txt')

### Final test to verify that the scores are identical to those obtained during hyperparameter tuning.

In [4]:
def report_effectiveness(dataset):
    """Evaluate the stored run of one dataset split and return its metric values.

    Reads the topics from ``../../<dataset>/queries.xml``, the run from
    ``<dataset>/run.txt`` and the qrels from ``../../<dataset>/qrels.txt``,
    evaluates the run with ``pt.Experiment`` and returns a dict that maps
    each metric name to the value found in the first row of the result.
    """
    metric_names = ("ndcg_cut_10", 'recip_rank', 'P_10', 'recall_100', 'recall_1000', 'recall_5000')

    # Files are read in the same order as before: topics, run, qrels.
    topics = pt.io.read_topics(f'../../{dataset}/queries.xml', 'trecxml')
    stored_run = pt.io.read_results(f'{dataset}/run.txt')
    run_as_transformer = pt.Transformer.from_df(stored_run)
    qrels = pt.io.read_qrels(f'../../{dataset}/qrels.txt')

    experiment = pt.Experiment(
        [run_as_transformer],
        topics,
        qrels,
        eval_metrics=list(metric_names),
    )

    # Only one system was evaluated, so its scores are in the first row.
    first_row = experiment.iloc[0]
    return {metric: first_row[metric] for metric in metric_names}

def report_pipeline():
    """Return a one-row DataFrame with the metrics of the train and dev splits.

    Each column is named ``<split>_<metric>``, where ``<split>`` is the part
    of the dataset name after the first ``-`` (``train`` or ``dev``) and the
    metric values come from ``report_effectiveness``.
    """
    scores = {
        f"{split_name.split('-')[1]}_{metric}": value
        for split_name in ('tot-train', 'tot-dev')
        for metric, value in report_effectiveness(split_name).items()
    }
    return pd.DataFrame([scores])

# Last expression of the cell: the returned DataFrame is shown as the cell output.
report_pipeline()

Unnamed: 0,train_ndcg_cut_10,train_recip_rank,train_P_10,train_recall_100,train_recall_1000,train_recall_5000,dev_ndcg_cut_10,dev_recip_rank,dev_P_10,dev_recall_100,dev_recall_1000,dev_recall_5000
0,0.099582,0.094137,0.013333,0.293333,0.506667,0.506667,0.09877,0.096233,0.012667,0.273333,0.493333,0.493333
