# Imports

## Imports and Setup

In [26]:
# Standard-library modules used below for path handling and sys.path manipulation.
import pathlib
import sys
import os

# In Jupyter notebooks, __file__ is not defined
# Instead, use the current working directory to modify the path
# (the parent of the working directory is what gets added, so this assumes the
# kernel was started in a directory directly below the project root).
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
# Inserted at position 0 so this entry is searched before the rest of sys.path.
sys.path.insert(0, project_root)
print(f"Added {project_root} to Python path")

import torch
import numpy as np
import pyterrier as pt
from pathlib import Path

# Initialize PyTerrier
# NOTE(review): the saved output of this cell echoes the `pt.started()` line,
# which looks like a warning attached to it (possibly a deprecation) — confirm
# against the installed PyTerrier version and migrate if so.
if not pt.started():
    pt.init()

# These imports are deliberately placed after the PyTerrier initialization above.
# The star import presumably supplies the measure objects (RR, nDCG, MAP) that the
# evaluation cells use; they are not imported by name anywhere else in this notebook.
from pyterrier.datasets import Dataset
from pyterrier.measures import *
from fast_forward.encoder import TASBEncoder, ContrieverEncoder
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.util import Indexer
from fast_forward.util.pyterrier import FFInterpolate, FFScore
from pyterrier.terrier import Retriever

# Project-local modules; presumably resolved through the project_root entry that
# was added to sys.path above — verify if the notebook is moved.
from fusions.FFTM2C2 import FFTM2C2
from fusions.experiment import fuse_convex_norm

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device="cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")


Added /home/weicheng/ir-project to Python path
Using device: cuda:0
PyTorch version: 2.6.0+cu124


  if not pt.started():


# Dataset Selection

In [27]:
# Choose the corpus by its PyTerrier dataset id
# (catalogue: https://pyterrier.readthedocs.io/en/latest/datasets.html).
dataset_name = "irds:beir/scifact"
dataset = pt.get_dataset(dataset_name)
# The "/test" variant of the same id is used later for topics and qrels.
testset = pt.get_dataset(f"{dataset_name}/test")

# Lexical index over the "text" field, built with the in-memory indexing type.
# The indexer still takes a path as first argument; per the original note it is
# ignored for this indexing type.
unused_index_path = str(Path.cwd())
indexer = pt.IterDictIndexer(unused_index_path, type=pt.index.IndexingType.MEMORY)
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])

[INFO] [starting] building docstore
[INFO] [starting] opening zip file                                              
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip
                                                                                
[A                                                                                                                       [INFO] [finished] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip: [00:01] [2.82MB] [1.47MB/s]
[INFO] [finished] opening zip file [2.48s]                                      
docs_iter: 100%|█████████████████████████| 5183/5183 [00:02<00:00, 2002.38doc/s]
[INFO] [finished] docs_iter: [00:02] [5183doc] [2001.95doc/s]
[INFO] [finished] building docstore [2.59s]
beir/scifact documents: 100%|██████████| 5183/5183 [00:01<00:00, 3318.27it/s]


# Model Configuration

Setting up three retrieval models:
1. BM25 - classic lexical retrieval over the Terrier index
2. TASB - dense neural encoder, scored through a Fast-Forward index
3. Contriever - dense neural encoder, scored through a Fast-Forward index

In [28]:
# Lexical first-stage retriever (BM25 weighting model) over the index built above.
bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")

# For each dense model a single encoder instance serves as both the query
# encoder and the document encoder.
tasb_d_encoder = TASBEncoder(device=device)
tasb_q_encoder = tasb_d_encoder
con_d_encoder = ContrieverEncoder(device=device)
con_q_encoder = con_d_encoder

In [29]:
# The dataset id contains ":" and "/"; map both to "_" so it can be used in file names.
safe_dataset_name = dataset_name.translate(str.maketrans({":": "_", "/": "_"}))

# One Fast-Forward index file per dense encoder, in ../indexes relative to the
# working directory.
tasb_index_path, con_index_path = (
    Path(f"../indexes/ffindex_{safe_dataset_name}_tasb.h5"),
    Path(f"../indexes/ffindex_{safe_dataset_name}_con.h5"),
)

def load_or_create_index(index_path: pathlib.Path, q_encoder, d_encoder, corpus_dataset=None, batch_size: int = 8):
    """Load the Fast-Forward index at ``index_path``, building it first if it is missing.

    Args:
        index_path: Location of the on-disk index file. Missing parent
            directories are created when the index has to be built.
        q_encoder: Query encoder attached to the index (passed as ``query_encoder``).
        d_encoder: Document encoder used to encode the corpus when building.
        corpus_dataset: Object exposing ``get_corpus_iter()`` that yields dicts
            with ``"docno"`` and ``"text"`` keys. Defaults to the notebook-level
            ``dataset`` so existing three-argument calls behave as before.
        batch_size: Batch size handed to the ``Indexer`` while encoding documents.

    Returns:
        Whatever ``to_memory()`` returns for the loaded or freshly built index.

    Raises:
        Any exception raised while encoding the corpus is re-raised after the
        partially written index file has been removed.
    """
    try:
        ff_index = OnDiskIndex.load(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
        print(f"Loaded Fast-Forward index from {index_path}")
    except FileNotFoundError:
        print(f"No Fast-Forward index at {index_path}; building it")
        index_path.parent.mkdir(exist_ok=True, parents=True)
        ff_index = OnDiskIndex(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
        # Resolve the corpus only on the build path, so loading an existing
        # index never depends on the notebook-global `dataset`.
        corpus = dataset if corpus_dataset is None else corpus_dataset

        def docs_iter():
            # The Indexer is fed dicts keyed by "doc_id" / "text"; the corpus
            # iterator provides the document id under "docno".
            for d in corpus.get_corpus_iter():
                yield {"doc_id": d["docno"], "text": d["text"]}

        try:
            Indexer(ff_index, d_encoder, batch_size=batch_size).from_dicts(docs_iter())
        except BaseException:
            # An interrupted build (including a kernel interrupt) would leave a
            # partial file behind that the next run would load as if it were
            # complete. Remove it so the next run rebuilds from scratch.
            index_path.unlink(missing_ok=True)
            raise

    return ff_index.to_memory()

# One index per dense encoder; each call returns the in-memory form of the index.
tasb_index = load_or_create_index(
    index_path=tasb_index_path,
    q_encoder=tasb_q_encoder,
    d_encoder=tasb_d_encoder,
)
con_index = load_or_create_index(
    index_path=con_index_path,
    q_encoder=con_q_encoder,
    d_encoder=con_d_encoder,
)



True


100%|██████████| 5183/5183 [00:00<00:00, 1823289.24it/s]




True


100%|██████████| 5183/5183 [00:00<00:00, 2253921.99it/s]


## Create Retrieval Pipelines

We create three pipelines:
1. BM25 only
2. BM25 re-ranked with TASB
3. BM25 re-ranked with Contriever

In [30]:
# Number of documents to retrieve with BM25 before re-ranking
RANK_CUTOFF = 50

# Fast-Forward scorers, one per dense index.
ff_tasb, ff_con = FFScore(tasb_index), FFScore(con_index)

# Retrieval pipelines. The parentheses only spell out the grouping that
# operator precedence already gives ("%" binds tighter than ">>").
pipeline_0 = bm25 % RANK_CUTOFF                 # BM25 only
pipeline_1 = (bm25 % RANK_CUTOFF) >> ff_tasb    # BM25 + TASB re-ranking
pipeline_2 = (bm25 % RANK_CUTOFF) >> ff_con     # BM25 + Contriever re-ranking


In [31]:

def get_pipeline_result(pipeline: Retriever, ds: Dataset):
    """Run ``pipeline`` over all topics of ``ds`` and return its output.

    Args:
        pipeline: Object with a ``transform`` method that accepts the topics.
        ds: Object with a ``get_topics()`` method supplying those topics.

    Returns:
        Whatever ``pipeline.transform`` returns for the dataset's topics.
    """
    run_pipeline = pipeline.transform
    topics = ds.get_topics()
    return run_pipeline(topics)

# Run every pipeline once on the test topics.
# Note the numbering offset: res_1/res_2/res_3 come from pipeline_0/1/2.
res_1 = get_pipeline_result(pipeline=pipeline_0, ds=testset)
res_2 = get_pipeline_result(pipeline=pipeline_1, ds=testset)
res_3 = get_pipeline_result(pipeline=pipeline_2, ds=testset)




[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [1ms]
[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [0ms]


## Model Fusion

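Combine the results of the three pipelines as a convex combination of their (optionally normalized) scores,
with weights 0.2 for BM25, 0.4 for TASB, and 0.4 for Contriever.
The variants below differ only in the score normalization applied before fusion: min-max, theoretical min-max, z-score, or none. The `*_lexical` variants normalize only the BM25 scores and leave the two dense score lists unnormalized.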

In [32]:
# Weights shared by every fusion variant, in the order BM25, TASB, Contriever.
FUSION_WEIGHTS = (0.2, 0.4, 0.4)


def _fuse_with(norm_bm25: str, norm_tasb: str, norm_con: str):
    """Fuse res_1, res_2 and res_3 using the shared weights.

    Only the normalization methods vary between the fusion variants, so they
    are the only parameters; the result frames and weights are fixed.

    Args:
        norm_bm25: Normalization method name for the BM25 results (res_1).
        norm_tasb: Normalization method name for the BM25+TASB results (res_2).
        norm_con: Normalization method name for the BM25+Contriever results (res_3).

    Returns:
        The value returned by ``fuse_convex_norm`` (presumably a results
        DataFrame, since it is later passed to ``pt.Experiment`` — confirm).
    """
    w_bm25, w_tasb, w_con = FUSION_WEIGHTS
    return fuse_convex_norm(
        df1=res_1,
        df2=res_2,
        df3=res_3,
        w1=w_bm25,
        w2=w_tasb,
        w3=w_con,
        normalization_method_1=norm_bm25,
        normalization_method_2=norm_tasb,
        normalization_method_3=norm_con,
    )


# Same normalization applied to all three score lists.
fuse_min_max = _fuse_with("min_max", "min_max", "min_max")
fuse_t_min_max = _fuse_with("theoretical_min_max", "theoretical_min_max", "theoretical_min_max")
fuse_z_score = _fuse_with("z_score", "z_score", "z_score")

# Only the lexical (BM25) scores are normalized; dense scores are used as-is.
fuse_min_max_lexical = _fuse_with("min_max", "unnormalized", "unnormalized")
fuse_t_min_max_lexical = _fuse_with("theoretical_min_max", "unnormalized", "unnormalized")
fuse_z_score_lexical = _fuse_with("z_score", "unnormalized", "unnormalized")

# No normalization at all.
fuse_clean = _fuse_with("unnormalized", "unnormalized", "unnormalized")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["rank"] = df.groupby("qid", sort=False)["score"].rank(ascending=False, method="first").astype(int) -1 + FIRST_RANK
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

## Evaluation Results

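Compare the individual pipelines against the fusion variants. Plain BM25 (the first entry) is the baseline for the significance tests, and p-values are Bonferroni-corrected.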

In [33]:
# Runs under comparison, keyed by the label shown in the results table.
# Dict order is preserved, so the first entry (plain BM25) is the baseline.
runs_vs_bm25 = {
    'bm25': res_1,
    'bm25+TASB': res_2,
    'bm25+Cont': res_3,
    'min_max_fusion': fuse_min_max,
    'theoretical_min_max_fusion': fuse_t_min_max,
    'z_score_fusion': fuse_z_score,
    'min_max_fusion_lexical_only': fuse_min_max_lexical,
    'theoretical_min_max_fusion_lexical_only': fuse_t_min_max_lexical,
    'z_score_fusion_lexical_only': fuse_z_score_lexical,
    'no_normalization_fusion': fuse_clean,
}

pt.Experiment(
    list(runs_vs_bm25.values()),
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=list(runs_vs_bm25.keys()),
    baseline=0,
    correction="bonferroni",
    save_dir="./results",
)



Unnamed: 0,name,nDCG@10,AP@100,RR@10,nDCG@10 +,nDCG@10 -,nDCG@10 p-value,nDCG@10 reject,nDCG@10 p-value corrected,AP@100 +,AP@100 -,AP@100 p-value,AP@100 reject,AP@100 p-value corrected,RR@10 +,RR@10 -,RR@10 p-value,RR@10 reject,RR@10 p-value corrected
0,bm25,0.672167,0.626235,0.632427,,,,False,,,,,False,,,,,False,
1,bm25+TASB,0.64309,0.604246,0.611757,64.0,65.0,0.081834,False,0.736508,69.0,71.0,0.231077,False,1.0,56.0,64.0,0.280644,False,1.0
2,bm25+Cont,0.664668,0.609543,0.622169,67.0,66.0,0.64059,False,1.0,70.0,67.0,0.347563,False,1.0,62.0,65.0,0.587228,False,1.0
3,min_max_fusion,0.690287,0.650008,0.659679,68.0,42.0,0.162625,False,1.0,72.0,46.0,0.095701,False,0.861312,61.0,42.0,0.075014,False,0.675128
4,theoretical_min_max_fusion,0.700102,0.658921,0.666589,67.0,34.0,0.016661,False,0.149951,70.0,38.0,0.009574,False,0.086167,60.0,34.0,0.011358,False,0.102226
5,z_score_fusion,0.699981,0.659639,0.667107,67.0,33.0,0.014863,False,0.13377,71.0,36.0,0.007277,False,0.065489,60.0,33.0,0.009121,False,0.082086
6,min_max_fusion_lexical_only,0.6477,0.609164,0.616164,66.0,63.0,0.132066,False,1.0,71.0,69.0,0.339675,False,1.0,58.0,62.0,0.383507,False,1.0
7,theoretical_min_max_fusion_lexical_only,0.6477,0.609164,0.616164,66.0,63.0,0.132066,False,1.0,71.0,69.0,0.339675,False,1.0,58.0,62.0,0.383507,False,1.0
8,z_score_fusion_lexical_only,0.666123,0.629208,0.636914,66.0,56.0,0.686103,False,1.0,72.0,62.0,0.85777,False,1.0,58.0,55.0,0.795848,False,1.0
9,no_normalization_fusion,0.689202,0.648462,0.655726,58.0,29.0,0.078741,False,0.708667,66.0,34.0,0.038208,False,0.343874,48.0,28.0,0.039319,False,0.353873


Here the unnormalized fusion of the three models is the baseline instead of BM25:

In [34]:
# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, so the export below would fail on a fresh
# checkout where "results" does not exist yet.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

# Fusion variants only; the first entry (unnormalized fusion) is the baseline
# that every other variant is tested against.
result = pt.Experiment(

    [fuse_clean,
    fuse_min_max, 
    fuse_t_min_max, 
    fuse_z_score, 
    fuse_min_max_lexical,
    fuse_t_min_max_lexical, 
    fuse_z_score_lexical,
    ],

    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=['no_normalization_fusion',
           'min_max_fusion', 
           'theoretical_min_max_fusion', 
           'z_score_fusion', 
           'min_max_fusion_lexical_only', 
           'theoretical_min_max_fusion_lexical_only', 
           'z_score_fusion_lexical_only',
           ],
    baseline=0,
    correction="bonferroni",
    save_dir="./results",
)

# Persist the comparison table; the file name is unchanged from the original
# (dataset name immediately followed by "output.csv").
result.to_csv(f'results/{safe_dataset_name}output.csv', index=False)

result


Unnamed: 0,name,nDCG@10,AP@100,RR@10,nDCG@10 +,nDCG@10 -,nDCG@10 p-value,nDCG@10 reject,nDCG@10 p-value corrected,AP@100 +,AP@100 -,AP@100 p-value,AP@100 reject,AP@100 p-value corrected,RR@10 +,RR@10 -,RR@10 p-value,RR@10 reject,RR@10 p-value corrected
0,no_normalization_fusion,0.689202,0.648462,0.655726,,,,False,,,,,False,,,,,False,
1,min_max_fusion,0.690287,0.650008,0.659679,43.0,38.0,0.885804,False,1.0,53.0,44.0,0.856972,False,1.0,36.0,37.0,0.677079,False,1.0
2,theoretical_min_max_fusion,0.700102,0.658921,0.666589,43.0,30.0,0.12989,False,0.77934,51.0,34.0,0.200693,False,1.0,35.0,28.0,0.213537,False,1.0
3,z_score_fusion,0.699981,0.659639,0.667107,42.0,28.0,0.098898,False,0.593388,53.0,34.0,0.138348,False,0.830089,34.0,27.0,0.161642,False,0.969851
4,min_max_fusion_lexical_only,0.6477,0.609164,0.616164,40.0,59.0,0.000417,True,0.002504,45.0,73.0,0.002872,True,0.017231,34.0,57.0,0.004123,True,0.024735
5,theoretical_min_max_fusion_lexical_only,0.6477,0.609164,0.616164,40.0,59.0,0.000417,True,0.002504,45.0,73.0,0.002872,True,0.017231,34.0,57.0,0.004123,True,0.024735
6,z_score_fusion_lexical_only,0.666123,0.629208,0.636914,37.0,52.0,0.020237,False,0.12142,42.0,67.0,0.088589,False,0.531532,31.0,50.0,0.11362,False,0.681719
