# Imports

## Imports and Setup

In [18]:
import pathlib
import sys
import os

# In Jupyter notebooks, __file__ is not defined
# Instead, use the current working directory to modify the path
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
sys.path.insert(0, project_root)
print(f"Added {project_root} to Python path")

import torch
import numpy as np
import pyterrier as pt
from pathlib import Path

# Initialize PyTerrier
if not pt.started():
    pt.init()

from pyterrier.datasets import Dataset
from pyterrier.measures import *
from fast_forward.encoder import TASBEncoder, ContrieverEncoder
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.util import Indexer
from fast_forward.util.pyterrier import FFInterpolate, FFScore
from pyterrier.terrier import Retriever

from fusions.FFTM2C2 import FFTM2C2
from fusions.experiment import fuse_convex_norm

device="cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")


Added /home/weicheng/ir-project to Python path
Using device: cuda:0
PyTorch version: 2.6.0+cu124


  if not pt.started():


# Dataset Selection

In [19]:
# Dataset Selection: https://pyterrier.readthedocs.io/en/latest/datasets.html
dataset_name = "irds:beir/fiqa"
dataset = pt.get_dataset(dataset_name)
testset = pt.get_dataset(dataset_name + "/test")

# Indexing
indexer = pt.IterDictIndexer(
    str(Path.cwd()),  # this will be ignored
    type=pt.index.IndexingType.MEMORY,
)
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])

beir/fiqa documents: 100%|██████████| 57638/57638 [00:07<00:00, 7256.75it/s]


# Model Configuration

Setting up three retrieval models:
1. BM25 - Classic lexical retrieval
2. TASB - Neural retriever model
3. Contriever - Neural retriever model

In [20]:
# BM25
bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")
tasb_q_encoder =  tasb_d_encoder = TASBEncoder(device=device)
con_q_encoder = con_d_encoder = ContrieverEncoder(device=device)

In [21]:
safe_dataset_name = dataset_name.replace(":", "_").replace("/", "_")
# Define index paths for both models
tasb_index_path = Path(f"../indexes/ffindex_{safe_dataset_name}_tasb.h5")
con_index_path = Path(f"../indexes/ffindex_{safe_dataset_name}_con.h5")

def load_or_create_index(index_path: pathlib.Path, q_encoder, d_encoder):
    print(index_path.exists())
    try:
        ff_index = OnDiskIndex.load(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
    except FileNotFoundError:
        index_path.parent.mkdir(exist_ok=True, parents=True)
        ff_index = OnDiskIndex(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
        from fast_forward.util import Indexer

        def docs_iter():
            for d in dataset.get_corpus_iter():
                yield {"doc_id": d["docno"], "text": d["text"]}

        Indexer(ff_index, d_encoder, batch_size=8).from_dicts(docs_iter())

    return ff_index.to_memory()

tasb_index = load_or_create_index(tasb_index_path, tasb_q_encoder, tasb_d_encoder)
con_index = load_or_create_index(con_index_path, con_q_encoder, con_d_encoder)



True


100%|██████████| 57638/57638 [00:00<00:00, 2437574.17it/s]


True


100%|██████████| 57638/57638 [00:00<00:00, 2669397.27it/s]


## Create Retrieval Pipelines

We create three pipelines:
1. BM25 only
2. BM25 re-ranked with TASB
3. BM25 re-ranked with Contriever

In [22]:
ff_tasb = FFScore(tasb_index)
ff_con = FFScore(con_index)
RANK_CUTOFF = 50  # Number of documents to retrieve with BM25 before re-ranking

# Define retrieval pipelines
pipeline_0 = (bm25 % RANK_CUTOFF)  # BM25 only
pipeline_1 = bm25 % RANK_CUTOFF >> ff_tasb  # BM25 + TASB re-ranking
pipeline_2 = bm25 % RANK_CUTOFF >> ff_con  # BM25 + Contriever re-ranking


In [23]:

def get_pipeline_result(pipeline: Retriever, ds: Dataset):
    return pipeline.transform(ds.get_topics())

res_1 = get_pipeline_result(pipeline_0,testset)
res_2 = get_pipeline_result(pipeline_1,testset)
res_3 = get_pipeline_result(pipeline_2,testset)




## Model Fusion

Combine results from the three models using convex normalization,
with weights 0.2 for BM25, 0.4 for TASB, and 0.4 for Contriever.

In [24]:
fuse_min_max = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="min_max",
    normalization_method_2="min_max",
    normalization_method_3="min_max",
)

fuse_t_min_max = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="theoretical_min_max",
    normalization_method_2="theoretical_min_max",
    normalization_method_3="theoretical_min_max",
)

fuse_z_score = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="z_score",
    normalization_method_2="z_score",
    normalization_method_3="z_score",
)

fuse_min_max_lexical = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="min_max",
    normalization_method_2="unnormalized",
    normalization_method_3="unnormalized",
)

fuse_t_min_max_lexical = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="theoretical_min_max",
    normalization_method_2="unnormalized",
    normalization_method_3="unnormalized",
)

fuse_z_score_lexical = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="z_score",
    normalization_method_2="unnormalized",
    normalization_method_3="unnormalized",
)

fuse_clean = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="unnormalized",
    normalization_method_2="unnormalized",
    normalization_method_3="unnormalized",
)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["rank"] = df.groupby("qid", sort=False)["score"].rank(ascending=False, method="first").astype(int) -1 + FIRST_RANK
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

## Evaluation Results

Compare performance of individual models vs fusion approach

In [None]:
pt.Experiment(

    [res_1,res_2,res_3,
    fuse_min_max, 
    fuse_t_min_max, 
    fuse_z_score, 
    fuse_min_max_lexical,
    fuse_t_min_max_lexical, 
    fuse_z_score_lexical,
    fuse_clean],

    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 100, nDCG @ 100, MAP @ 100],
    names=['bm25', 'bm25+TASB', 'bm25+Cont', 
           'min_max_fusion', 
           'theoretical_min_max_fusion', 
           'z_score_fusion', 
           'min_max_fusion_lexical_only', 
           'theoretical_min_max_fusion_lexical_only', 
           'z_score_fusion_lexical_only',
           'no_normalization_fusion'],
    baseline=0,
    correction="bonferroni",
    save_dir="./results",
)



Unnamed: 0,name,nDCG@10,AP@100,RR@10,nDCG@10 +,nDCG@10 -,nDCG@10 p-value,nDCG@10 reject,nDCG@10 p-value corrected,AP@100 +,AP@100 -,AP@100 p-value,AP@100 reject,AP@100 p-value corrected,RR@10 +,RR@10 -,RR@10 p-value,RR@10 reject,RR@10 p-value corrected
0,bm25,0.252589,0.20664,0.310271,,,,False,,,,,False,,,,,False,
1,bm25+TASB,0.306769,0.250428,0.371344,204.0,111.0,6.801317e-09,True,6.121185e-08,231.0,142.0,5.456308e-07,True,4.910677e-06,170.0,96.0,3.30243e-06,True,2.972187e-05
2,bm25+Cont,0.278087,0.223183,0.34273,189.0,132.0,0.008707841,False,0.07837057,228.0,156.0,0.06522488,False,0.5870239,161.0,113.0,0.01663045,False,0.1496741
3,min_max_fusion,0.319821,0.261455,0.393266,209.0,81.0,1.019503e-15,True,9.175524e-15,255.0,101.0,2.384721e-12,True,2.146249e-11,173.0,66.0,6.024211e-12,True,5.421789e-11
4,theoretical_min_max_fusion,0.31426,0.254076,0.384178,213.0,69.0,3.508218e-16,True,3.157396e-15,256.0,94.0,5.905687e-12,True,5.315118e-11,174.0,54.0,9.746911e-12,True,8.77222e-11
5,z_score_fusion,0.320009,0.260463,0.393372,214.0,78.0,2.289333e-16,True,2.0604e-15,257.0,97.0,1.758275e-12,True,1.582447e-11,175.0,65.0,2.2923e-12,True,2.06307e-11
6,min_max_fusion_lexical_only,0.309522,0.252317,0.374074,207.0,110.0,8.627872e-10,True,7.765085e-09,232.0,141.0,1.211447e-07,True,1.090302e-06,173.0,94.0,9.530671e-07,True,8.577604e-06
7,theoretical_min_max_fusion_lexical_only,0.309565,0.252361,0.374111,207.0,110.0,8.406669e-10,True,7.566002e-09,232.0,141.0,1.181188e-07,True,1.063069e-06,173.0,94.0,9.394469e-07,True,8.455023e-06
8,z_score_fusion_lexical_only,0.312643,0.256742,0.381692,208.0,93.0,9.540188e-12,True,8.586169e-11,239.0,123.0,9.073385e-10,True,8.166046e-09,172.0,79.0,8.304862e-09,True,7.474375e-08
9,no_normalization_fusion,0.312667,0.254689,0.378361,205.0,68.0,2.7653410000000002e-17,True,2.488807e-16,254.0,95.0,2.145978e-12,True,1.93138e-11,163.0,55.0,1.872283e-12,True,1.685055e-11


Use unnormalized fusion of 3 models as baseline here:

In [26]:
result = pt.Experiment(

    [fuse_clean,
    fuse_min_max, 
    fuse_t_min_max, 
    fuse_z_score, 
    fuse_min_max_lexical,
    fuse_t_min_max_lexical, 
    fuse_z_score_lexical,
    pipeline_0,
    pipeline_1,
    pipeline_2,
    ],

    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[R@10, R@100, RR @ 100, nDCG @ 100, MAP @ 100],
    names=['no_normalization_fusion',
           'min_max_fusion', 
           'theoretical_min_max_fusion', 
           'z_score_fusion', 
           'min_max_fusion_lexical_only', 
           'theoretical_min_max_fusion_lexical_only', 
           'z_score_fusion_lexical_only',
           'BM25',
            'BM25+TASB',
            'BM25+Cont'
           ],
    baseline=0,
    correction="bonferroni",
)

result.to_csv(f'results/{safe_dataset_name}_output.csv', index=False)

result


Unnamed: 0,name,R@10,R@100,nDCG@100,AP@100,RR@100,R@10 +,R@10 -,R@10 p-value,R@10 reject,...,AP@100 +,AP@100 -,AP@100 p-value,AP@100 reject,AP@100 p-value corrected,RR@100 +,RR@100 -,RR@100 p-value,RR@100 reject,RR@100 p-value corrected
0,no_normalization_fusion,0.378606,0.485482,0.343885,0.254689,0.38484,,,,False,...,,,,False,,,,,False,
1,min_max_fusion,0.379276,0.485482,0.350686,0.261455,0.399405,39.0,35.0,0.9262124,False,...,180.0,129.0,0.1249552,False,1.0,131.0,98.0,0.05171275,False,0.4654147
2,theoretical_min_max_fusion,0.379612,0.485482,0.344824,0.254076,0.389996,42.0,37.0,0.8924883,False,...,168.0,146.0,0.8944222,False,1.0,122.0,110.0,0.4987473,False,1.0
3,z_score_fusion,0.381836,0.485482,0.350018,0.260463,0.399106,42.0,35.0,0.6580028,False,...,179.0,129.0,0.1780375,False,1.0,130.0,96.0,0.05766141,False,0.5189527
4,min_max_fusion_lexical_only,0.376435,0.485482,0.341369,0.252317,0.37996,29.0,37.0,0.7377426,False,...,138.0,157.0,0.6019936,False,1.0,111.0,122.0,0.5019264,False,1.0
5,theoretical_min_max_fusion_lexical_only,0.376435,0.485482,0.341404,0.252361,0.379996,29.0,37.0,0.7377426,False,...,139.0,157.0,0.6088809,False,1.0,112.0,122.0,0.5051579,False,1.0
6,z_score_fusion_lexical_only,0.374506,0.485482,0.345305,0.256742,0.388083,22.0,32.0,0.4793261,False,...,138.0,142.0,0.5881552,False,1.0,107.0,106.0,0.602941,False,1.0
7,BM25,0.309708,0.485482,0.301535,0.20664,0.320309,24.0,109.0,1.065962e-11,True,...,95.0,254.0,2.145978e-12,True,1.93138e-11,78.0,203.0,6.786087e-12,True,6.107478e-11
8,BM25+TASB,0.372474,0.485482,0.339768,0.250428,0.377691,25.0,39.0,0.3311288,False,...,135.0,162.0,0.3737192,False,1.0,107.0,125.0,0.3411067,False,1.0
9,BM25+Cont,0.344599,0.485482,0.318383,0.223183,0.350281,50.0,91.0,0.001071167,True,...,147.0,221.0,0.0001685135,True,0.001516622,121.0,185.0,0.004332666,True,0.038994
