# Imports

## Imports and Setup

In [1]:
import pathlib
import sys
import os

# In Jupyter notebooks, __file__ is not defined, so the project root is derived
# from the current working directory instead: it is taken to be the parent of
# the directory the kernel was started in.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
# Guard the insert so that re-running this cell does not stack duplicate
# entries on sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to Python path")
else:
    print(f"{project_root} is already on the Python path")

import torch
import numpy as np
import pyterrier as pt
from pathlib import Path

# Initialize PyTerrier's Java backend explicitly.
# pt.started()/pt.init() are deprecated and trigger the warnings recorded in
# this cell's output, which point to pt.java.init() as the replacement.
if not pt.java.started():
    pt.java.init()

from pyterrier.datasets import Dataset
from pyterrier.measures import *
from fast_forward.encoder import TASBEncoder, ContrieverEncoder
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.util import Indexer
from fast_forward.util.pyterrier import FFInterpolate, FFScore
from pyterrier.terrier import Retriever

from fusions.FFTM2C2 import FFTM2C2
from fusions.experiment import fuse_convex_norm

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")


Added /home/weicheng/ir-project to Python path


  if not pt.started():
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()
  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda:0
PyTorch version: 2.6.0+cu124


# Dataset Selection

In [2]:
# Dataset Selection: https://pyterrier.readthedocs.io/en/latest/datasets.html
dataset_name = "irds:beir/quora"
test_split_name = dataset_name + "/test"

# `dataset` supplies the corpus for indexing; `testset` supplies the topics
# and qrels used for evaluation further down.
dataset = pt.get_dataset(dataset_name)
testset = pt.get_dataset(test_split_name)

# Indexing
indexer = pt.IterDictIndexer(
    str(Path.cwd()),  # this will be ignored
    type=pt.index.IndexingType.MEMORY,
)
corpus_iter = dataset.get_corpus_iter()
index_ref = indexer.index(corpus_iter, fields=["text"])

beir/quora documents: 100%|██████████| 522931/522931 [00:14<00:00, 37190.95it/s]


# Model Configuration

Setting up three retrieval models:
1. BM25 - Classic lexical retrieval
2. TASB - Neural retriever model
3. Contriever - Neural retriever model

In [3]:
# BM25 retriever over the index referenced by index_ref
bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")

# TASB: a single encoder instance is shared between queries and documents
tasb_q_encoder = TASBEncoder(device=device)
tasb_d_encoder = tasb_q_encoder

# Contriever: likewise one shared instance for queries and documents
con_q_encoder = ContrieverEncoder(device=device)
con_d_encoder = con_q_encoder

In [4]:
# Make the dataset name usable inside a file name: ':' and '/' both become '_'
safe_dataset_name = dataset_name.translate(str.maketrans(":/", "__"))

# Define index paths for both models
tasb_index_path = Path(f"../indexes/ffindex_{safe_dataset_name}_tasb.h5")
con_index_path = Path(f"../indexes/ffindex_{safe_dataset_name}_con.h5")

def load_or_create_index(index_path: pathlib.Path, q_encoder, d_encoder, batch_size: int = 8):
    """Load the Fast-Forward index stored at ``index_path``, building it if the file is missing.

    Args:
        index_path: Location of the on-disk index file.
        q_encoder: Query encoder attached to the index (both when loading and
            when creating it).
        d_encoder: Document encoder; only used when the index has to be built.
        batch_size: Batch size handed to the ``Indexer`` when building. The
            default of 8 is the value that was previously hard-coded.

    Returns:
        The result of ``to_memory()`` on the loaded or freshly built index.

    Note:
        When the index is built, the documents are read from the
        notebook-level ``dataset`` variable, so that cell must have run first.
    """
    # Prints whether the index file is already present on disk.
    print(index_path.exists())
    try:
        ff_index = OnDiskIndex.load(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
    except FileNotFoundError:
        # No index file yet: create the parent folder and build a new index.
        index_path.parent.mkdir(exist_ok=True, parents=True)
        ff_index = OnDiskIndex(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )

        def docs_iter():
            # Map the corpus records to the {"doc_id", "text"} dicts passed to from_dicts().
            for d in dataset.get_corpus_iter():
                yield {"doc_id": d["docno"], "text": d["text"]}

        # Indexer comes from the notebook's top-level import cell; the former
        # function-local re-import was redundant.
        Indexer(ff_index, d_encoder, batch_size=batch_size).from_dicts(docs_iter())

    return ff_index.to_memory()

# Load the index of each dense model, building it if its file does not exist yet
tasb_index = load_or_create_index(
    index_path=tasb_index_path,
    q_encoder=tasb_q_encoder,
    d_encoder=tasb_d_encoder,
)
con_index = load_or_create_index(
    index_path=con_index_path,
    q_encoder=con_q_encoder,
    d_encoder=con_d_encoder,
)



True


100%|██████████| 522931/522931 [00:00<00:00, 706503.03it/s]


True


100%|██████████| 522931/522931 [00:00<00:00, 777637.90it/s]


## Create Retrieval Pipelines

We create three pipelines:
1. BM25 only
2. BM25 re-ranked with TASB
3. BM25 re-ranked with Contriever

In [5]:
RANK_CUTOFF = 50  # Number of documents to retrieve with BM25 before re-ranking

# One FFScore stage per dense index
ff_tasb = FFScore(tasb_index)
ff_con = FFScore(con_index)

# Define retrieval pipelines.
# '%' binds tighter than '>>'; the parentheses only make the grouping explicit.
pipeline_0 = bm25 % RANK_CUTOFF  # BM25 only
pipeline_1 = (bm25 % RANK_CUTOFF) >> ff_tasb  # BM25 + TASB re-ranking
pipeline_2 = (bm25 % RANK_CUTOFF) >> ff_con  # BM25 + Contriever re-ranking


In [6]:

def get_pipeline_result(pipeline: "pt.Transformer", ds: "Dataset"):
    """Run ``pipeline`` over the topics of ``ds`` and return its output.

    Args:
        pipeline: Object exposing ``transform(topics)``. The notebook passes
            composed pipelines such as ``bm25 % RANK_CUTOFF >> ff_tasb``, so
            the general transformer type is used for the annotation rather
            than ``Retriever``.
        ds: Dataset whose ``get_topics()`` provides the queries.

    Returns:
        Whatever ``pipeline.transform`` returns for those topics.
    """
    return pipeline.transform(ds.get_topics())

# Note the numbering offset: res_N is the output of pipeline_(N-1).
res_1 = get_pipeline_result(pipeline=pipeline_0, ds=testset)  # BM25 only
res_2 = get_pipeline_result(pipeline=pipeline_1, ds=testset)  # BM25 + TASB
res_3 = get_pipeline_result(pipeline=pipeline_2, ds=testset)  # BM25 + Contriever




## Model Fusion

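Combine the results of the three models with a convex (weighted) combination of their scores,
using weights 0.2 for BM25, 0.4 for TASB, and 0.4 for Contriever. The variants below differ only in the score normalization method chosen for each model (min-max, theoretical min-max, z-score, or none).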

In [7]:
# Every fusion variant combines the same three runs with the same weights and
# differs only in the normalization method chosen per run, so the shared
# arguments are written once instead of being copy-pasted seven times.
FUSION_WEIGHTS = {"w1": 0.2, "w2": 0.4, "w3": 0.4}  # BM25, TASB, Contriever


def _fuse_runs(norm_1, norm_2, norm_3):
    """Fuse res_1 (BM25), res_2 (BM25+TASB) and res_3 (BM25+Contriever).

    Args:
        norm_1: Normalization method name passed for ``res_1``.
        norm_2: Normalization method name passed for ``res_2``.
        norm_3: Normalization method name passed for ``res_3``.

    Returns:
        The result of ``fuse_convex_norm`` with the fixed weights above.
    """
    return fuse_convex_norm(
        df1=res_1,
        df2=res_2,
        df3=res_3,
        **FUSION_WEIGHTS,
        normalization_method_1=norm_1,
        normalization_method_2=norm_2,
        normalization_method_3=norm_3,
    )


# Same normalization method for all three runs
fuse_min_max = _fuse_runs("min_max", "min_max", "min_max")
fuse_t_min_max = _fuse_runs("theoretical_min_max", "theoretical_min_max", "theoretical_min_max")
fuse_z_score = _fuse_runs("z_score", "z_score", "z_score")

# "lexical" variants: only the BM25 run is normalized, the other two are left as-is
fuse_min_max_lexical = _fuse_runs("min_max", "unnormalized", "unnormalized")
fuse_t_min_max_lexical = _fuse_runs("theoretical_min_max", "unnormalized", "unnormalized")
fuse_z_score_lexical = _fuse_runs("z_score", "unnormalized", "unnormalized")

# No normalization for any run
fuse_clean = _fuse_runs("unnormalized", "unnormalized", "unnormalized")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["rank"] = df.groupby("qid", sort=False)["score"].rank(ascending=False, method="first").astype(int) -1 + FIRST_RANK
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_i

## Evaluation Results

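Compare the performance of the individual models against the fusion variants. BM25 is the baseline for the significance tests, which use Bonferroni correction.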

In [8]:
# Saved run files are named after the entries in `names`, which do not include
# the dataset. The index files and the CSV export elsewhere in this notebook
# are keyed by safe_dataset_name, so the saved runs get a per-dataset folder
# too; this keeps runs of different datasets apart.
experiment_dir = Path("results") / safe_dataset_name
experiment_dir.mkdir(parents=True, exist_ok=True)

pt.Experiment(
    # Precomputed result frames: the three individual runs, then the fusions
    [
        res_1,
        res_2,
        res_3,
        fuse_min_max,
        fuse_t_min_max,
        fuse_z_score,
        fuse_min_max_lexical,
        fuse_t_min_max_lexical,
        fuse_z_score_lexical,
        fuse_clean,
    ],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 100, nDCG @ 100, MAP @ 100],
    names=[
        'bm25',
        'bm25+TASB',
        'bm25+Cont',
        'min_max_fusion',
        'theoretical_min_max_fusion',
        'z_score_fusion',
        'min_max_fusion_lexical_only',
        'theoretical_min_max_fusion_lexical_only',
        'z_score_fusion_lexical_only',
        'no_normalization_fusion',
    ],
    baseline=0,  # significance is tested against the first system (bm25)
    correction="bonferroni",
    save_dir=str(experiment_dir),
)



Unnamed: 0,name,nDCG@100,AP@100,RR@100,nDCG@100 +,nDCG@100 -,nDCG@100 p-value,nDCG@100 reject,nDCG@100 p-value corrected,AP@100 +,AP@100 -,AP@100 p-value,AP@100 reject,AP@100 p-value corrected,RR@100 +,RR@100 -,RR@100 p-value,RR@100 reject,RR@100 p-value corrected
0,bm25,0.789557,0.726441,0.761615,,,,False,,,,,False,,,,,False,
1,bm25+TASB,0.837675,0.78825,0.821912,2737.0,1380.0,1.613073e-90,True,1.451766e-89,2743.0,1373.0,8.301998e-90,True,7.471797999999999e-89,2227.0,1144.0,9.334294e-77,True,8.400865e-76
2,bm25+Cont,0.843315,0.79531,0.827909,2795.0,1366.0,3.876069e-107,True,3.488462e-106,2805.0,1351.0,1.703779e-105,True,1.533401e-104,2286.0,1136.0,4.250667e-87,True,3.8256e-86
3,min_max_fusion,0.861969,0.820012,0.852484,2927.0,828.0,2.8379889999999998e-239,True,2.55419e-238,2942.0,813.0,9.088902e-240,True,8.180012e-239,2345.0,670.0,2.778009e-200,True,2.5002079999999998e-199
4,theoretical_min_max_fusion,0.863396,0.821913,0.853896,2949.0,713.0,7.730304000000001e-267,True,6.957273999999999e-266,2964.0,696.0,1.124486e-267,True,1.0120369999999999e-266,2345.0,586.0,2.1326459999999999e-221,True,1.919382e-220
5,z_score_fusion,0.863352,0.821748,0.854227,2950.0,694.0,8.839981e-272,True,7.955983e-271,2966.0,677.0,4.488187e-272,True,4.039368e-271,2341.0,560.0,1.610115e-227,True,1.449103e-226
6,min_max_fusion_lexical_only,0.839855,0.79108,0.824713,2754.0,1350.0,1.5054350000000001e-99,True,1.354891e-98,2758.0,1345.0,8.049663000000001e-99,True,7.244697e-98,2235.0,1111.0,1.441948e-84,True,1.297753e-83
7,theoretical_min_max_fusion_lexical_only,0.839839,0.791053,0.824703,2754.0,1352.0,1.7851530000000002e-99,True,1.606638e-98,2758.0,1347.0,9.965405e-99,True,8.968865e-98,2235.0,1112.0,1.555826e-84,True,1.400243e-83
8,z_score_fusion_lexical_only,0.844337,0.796901,0.83038,2780.0,1205.0,2.3732180000000002e-126,True,2.1358960000000001e-125,2784.0,1200.0,1.188965e-125,True,1.0700679999999999e-124,2239.0,974.0,2.224107e-107,True,2.0016960000000002e-106
9,no_normalization_fusion,0.850708,0.805043,0.838726,2804.0,835.0,3.3680840000000005e-206,True,3.031275e-205,2810.0,828.0,2.436857e-204,True,2.193171e-203,2217.0,658.0,4.585937e-173,True,4.127343e-172


Here, the unnormalized fusion of the three models is used as the baseline instead:

In [9]:
result = pt.Experiment(
    [
        fuse_clean,
        fuse_min_max,
        fuse_t_min_max,
        fuse_z_score,
        fuse_min_max_lexical,
        fuse_t_min_max_lexical,
        fuse_z_score_lexical,
        # Evaluate the rankings already computed for pipeline_0/1/2 instead of
        # running the three pipelines again.
        res_1,
        res_2,
        res_3,
    ],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[R@10, R@100, RR @ 100, nDCG @ 100, MAP @ 100],
    names=[
        'no_normalization_fusion',
        'min_max_fusion',
        'theoretical_min_max_fusion',
        'z_score_fusion',
        'min_max_fusion_lexical_only',
        'theoretical_min_max_fusion_lexical_only',
        'z_score_fusion_lexical_only',
        'BM25',
        'BM25+TASB',
        'BM25+Cont',
    ],
    baseline=0,  # significance is tested against the first system (no_normalization_fusion)
    correction="bonferroni",
)

# to_csv does not create missing directories, so make sure the target exists.
Path("results").mkdir(parents=True, exist_ok=True)
result.to_csv(f'results/{safe_dataset_name}_output.csv', index=False)

result


Unnamed: 0,name,R@10,R@100,nDCG@100,AP@100,RR@100,R@10 +,R@10 -,R@10 p-value,R@10 reject,...,AP@100 +,AP@100 -,AP@100 p-value,AP@100 reject,AP@100 p-value corrected,RR@100 +,RR@100 -,RR@100 p-value,RR@100 reject,RR@100 p-value corrected
0,no_normalization_fusion,0.917767,0.950484,0.850708,0.805043,0.838726,,,,False,...,,,,False,,,,,False,
1,min_max_fusion,0.924249,0.950484,0.861969,0.820012,0.852484,201.0,68.0,1.320978e-11,True,...,1442.0,713.0,2.3539640000000002e-29,True,2.118567e-28,966.0,473.0,2.60345e-21,True,2.343105e-20
2,theoretical_min_max_fusion,0.925118,0.950484,0.863396,0.821913,0.853896,219.0,75.0,2.219738e-13,True,...,1441.0,673.0,1.430656e-37,True,1.2875909999999999e-36,963.0,437.0,4.5182659999999995e-26,True,4.0664400000000003e-25
3,z_score_fusion,0.924831,0.950484,0.863352,0.821748,0.854227,208.0,72.0,2.677051e-13,True,...,1417.0,641.0,1.825599e-39,True,1.6430389999999998e-38,943.0,402.0,3.812394e-29,True,3.4311540000000004e-28
4,min_max_fusion_lexical_only,0.909324,0.950484,0.839855,0.79108,0.824713,108.0,219.0,6.143901e-13,True,...,870.0,1369.0,9.633955e-24,True,8.67056e-23,567.0,961.0,1.755215e-20,True,1.5796929999999998e-19
5,theoretical_min_max_fusion_lexical_only,0.909274,0.950484,0.839839,0.791053,0.824703,108.0,220.0,4.709883e-13,True,...,870.0,1371.0,8.027926e-24,True,7.225133000000001e-23,567.0,962.0,1.65325e-20,True,1.4879249999999998e-19
6,z_score_fusion_lexical_only,0.912013,0.950484,0.844337,0.796901,0.83038,97.0,169.0,4.010319e-08,True,...,822.0,1164.0,7.887559e-12,True,7.098803e-11,516.0,776.0,1.095911e-10,True,9.863201e-10
7,BM25,0.868396,0.950484,0.789557,0.726441,0.761615,140.0,855.0,2.929341e-107,True,...,828.0,2810.0,2.436857e-204,True,2.193171e-203,658.0,2217.0,4.585937e-173,True,4.127343e-172
8,BM25+TASB,0.907884,0.950484,0.837675,0.78825,0.821912,102.0,239.0,1.938337e-16,True,...,848.0,1453.0,8.559219000000001e-32,True,7.703297000000001e-31,560.0,1032.0,2.784065e-27,True,2.5056589999999997e-26
9,BM25+Cont,0.918331,0.950484,0.843315,0.79531,0.827909,306.0,294.0,0.7107337,False,...,1496.0,1716.0,2.483615e-05,True,0.0002235253,1119.0,1328.0,1.206262e-05,True,0.0001085635
