# Imports

In [1]:
import pathlib
import sys
import os
# sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import torch
import numpy as np
import pyterrier as pt
from pathlib import Path

from pyterrier.datasets import Dataset
from pyterrier.measures import *
from fast_forward.encoder import TASBEncoder, ContrieverEncoder
import torch
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.util import Indexer
from fast_forward.util.pyterrier import FFInterpolate
from fast_forward.util import Indexer
from pyterrier.terrier import Retriever

from fusions.FFTM2C2 import FFTM2C2
from fast_forward.util.pyterrier import FFScore

from fusions.experiment import fuse_convex_norm

device="cuda:0" if torch.cuda.is_available() else "cpu"
print(torch.__version__)


  from .autonotebook import tqdm as notebook_tqdm


2.6.0+cu124


# Dataset Selection

In [2]:
# Dataset Selection: https://pyterrier.readthedocs.io/en/latest/datasets.html
dataset_name = "irds:beir/fiqa"
dataset = pt.get_dataset(dataset_name)
testset = pt.get_dataset(dataset_name + "/test")

# Indexing
indexer = pt.IterDictIndexer(
    str(Path.cwd()),  # this will be ignored
    type=pt.index.IndexingType.MEMORY,
)
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])

Java started (triggered by TerrierIndexer.__init__) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
beir/fiqa documents: 100%|██████████| 57638/57638 [00:11<00:00, 4803.97it/s]


# Retriever Configuration

In [3]:
# BM25
bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")
tasb_q_encoder =  tasb_d_encoder = TASBEncoder(device=device)
con_q_encoder = con_d_encoder = ContrieverEncoder(device=device)

In [4]:
safe_dataset_name = dataset_name.replace(":", "_").replace("/", "_")
# Define index paths for both models
tasb_index_path = Path(f"../indexes/ffindex_{safe_dataset_name}_tasb.h5")
con_index_path = Path(f"../indexes/ffindex_{safe_dataset_name}_con.h5")

def load_or_create_index(index_path: pathlib.Path, q_encoder, d_encoder):
    print(index_path.exists())
    try:
        ff_index = OnDiskIndex.load(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
    except FileNotFoundError:
        index_path.parent.mkdir(exist_ok=True, parents=True)
        ff_index = OnDiskIndex(
            index_path,
            query_encoder=q_encoder,
            mode=Mode.MAXP,
        )
        from fast_forward.util import Indexer

        def docs_iter():
            for d in dataset.get_corpus_iter():
                yield {"doc_id": d["docno"], "text": d["text"]}

        Indexer(ff_index, d_encoder, batch_size=8).from_dicts(docs_iter())

    return ff_index.to_memory()

tasb_index = load_or_create_index(tasb_index_path, tasb_q_encoder, tasb_d_encoder)
con_index = load_or_create_index(con_index_path, con_q_encoder, con_d_encoder)



True


100%|██████████| 57638/57638 [00:00<00:00, 443034.08it/s]


True


100%|██████████| 57638/57638 [00:00<00:00, 435813.28it/s]


In [12]:

ff_tasb = FFScore(tasb_index)
ff_con = FFScore(con_index)
RANK_CUTOFF = 50# change this for experiments

# cutoff should be 1000 for experiments
pipeline_0 = (bm25 % RANK_CUTOFF)
pipeline_1 = bm25 % RANK_CUTOFF>> ff_tasb
pipeline_2 = bm25 % RANK_CUTOFF >> ff_con




In [13]:

def get_pipeline_result(pipeline: Retriever, ds: Dataset):
    return pipeline.transform(ds.get_topics())

res_1 = get_pipeline_result(pipeline_0,testset)
res_2 = get_pipeline_result(pipeline_1,testset)
res_3 = get_pipeline_result(pipeline_2,testset)


In [14]:
fuse_res = fuse_convex_norm(
    df1=res_1,
    df2=res_2,
    df3=res_3,
    w1=0.2,
    w2=0.4,
    w3=0.4,
    normalization_method_1="min_max",
    normalization_method_2="min_max",
    normalization_method_3="min_max",
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=["rank"], errors="ignore", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["rank"] = df.groupby("qid", sort=False)["score"].rank(ascending=False, method="first").astype(int) -1 + FIRST_RANK


In [17]:
result = pt.Experiment(

    [res_1,res_2,res_3, fuse_res],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=['bm25', 'bm25 >> TASB', 'bm25 >> Cont', 'fused'],
    baseline=0,
    correction="bonferroni"
)

print(result)


           name   nDCG@10    AP@100     RR@10  nDCG@10 +  nDCG@10 -  \
0          bm25  0.252589  0.206640  0.310271        NaN        NaN   
1  bm25 >> TASB  0.306769  0.250428  0.371344      204.0      111.0   
2  bm25 >> Cont  0.278087  0.223183  0.342730      189.0      132.0   
3         fused  0.319821  0.261455  0.393266      209.0       81.0   

   nDCG@10 p-value  nDCG@10 reject  nDCG@10 p-value corrected  AP@100 +  \
0              NaN           False                        NaN       NaN   
1     6.801317e-09            True               2.040395e-08     231.0   
2     8.707841e-03            True               2.612352e-02     228.0   
3     1.019503e-15            True               3.058508e-15     255.0   

   AP@100 -  AP@100 p-value  AP@100 reject  AP@100 p-value corrected  RR@10 +  \
0       NaN             NaN          False                       NaN      NaN   
1     142.0    5.456308e-07           True              1.636892e-06    170.0   
2     156.0    6.522488e-