# Information Retrieval

In [None]:
import sys
import os

# Make the parent of this file's directory importable, so that project
# packages living there (e.g. `fusions`) can be imported below.
# `__file__` is not defined when this code is executed as a notebook cell, so
# fall back to the current working directory in that case.
try:
    _base_dir = os.path.dirname(__file__)
except NameError:
    _base_dir = os.getcwd()
sys.path.insert(0, os.path.abspath(os.path.join(_base_dir, '..')))

import torch
import numpy as np
import pyterrier as pt
from pathlib import Path
from pyterrier.measures import RR, nDCG, MAP
from fast_forward.encoder import TASBEncoder
import torch
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.util import Indexer
from fast_forward.util.pyterrier import FFInterpolate
from fast_forward.util import Indexer

from fusions.FFTM2C2 import FFTM2C2
from fast_forward.util.pyterrier import FFScore
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# Show which PyTorch version is installed.
print(torch.__version__)

### Fusion Functions

In [14]:
# Custom fusion transformers, modelled on the reference implementations in:
# https://github.com/mrjleo/fast-forward-indexes/blob/main/src/fast_forward/util/pyterrier.py
class FFRRF(pt.Transformer):
    """Fuse two score columns with Reciprocal Rank Fusion (RRF).

    The input frame must contain ``qid``, ``docno``, ``query``, ``score_0``
    (treated as the BM25 score) and ``score`` (treated as the neural score).
    Each score column is turned into a hard rank *within its query*, and the
    fused score is ``1 / (k + rank_bm25) + 1 / (k + rank_neural)``.

    Args:
        k: Constant added to every rank before taking the reciprocal.
    """

    def __init__(self, k=60):
        self.k = k
        super().__init__()

    def transform(self, df):
        """Return a new frame with the RRF score and ranks derived from it.

        Args:
            df: Result frame with the columns listed in the class docstring.

        Returns:
            A frame with ``qid``, ``docno``, ``query`` and the fused ``score``,
            passed through ``pt.model.add_ranks``.
        """
        new_df = df[["qid", "docno", "query"]].copy()
        # Ranks must be computed per query. Ranking the whole frame at once
        # would compare documents retrieved for different queries with each
        # other, so almost every rank would be far larger than it should be.
        per_query = df.groupby("qid", sort=False)
        # method="min": tied scores share the best (smallest) rank of the tie.
        bm25_rank = per_query["score_0"].rank(method="min", ascending=False)
        neural_rank = per_query["score"].rank(method="min", ascending=False)
        new_df["score"] = 1 / (self.k + bm25_rank) + 1 / (self.k + neural_rank)
        return pt.model.add_ranks(new_df, single_query=False)


class FFSRRF(pt.Transformer):
    """Fuse two score columns with Soft Reciprocal Rank Fusion (SRRF).

    Works like reciprocal rank fusion, but replaces the hard rank of a
    document by a smooth approximation built from a logistic function of the
    pairwise score differences. The input frame must contain ``qid``,
    ``docno``, ``query``, ``score_0`` (treated as the BM25 score) and
    ``score`` (treated as the neural score).

    Args:
        k: Constant added to every soft rank before taking the reciprocal.
        beta: Steepness of the logistic function; larger values make the soft
            rank behave more like a hard rank.
    """

    def __init__(self, k=60, beta=1.0):
        self.k = k
        self.beta = beta
        super().__init__()

    def _soft_rank(self, scores):
        """Return the soft rank of every entry of the 1-D array ``scores``.

        Entry ``i`` is ``sum_j 1 / (1 + exp(beta * (scores[i] - scores[j])))``.
        The sum includes ``j == i``, which always contributes 0.5.
        """
        # diffs[i, j] = scores[i] - scores[j]; one row per document.
        diffs = scores[:, None] - scores[None, :]
        # exp() may overflow to inf for large differences; the term then
        # becomes exactly 0, which is the intended limit, so silence the
        # overflow warning.
        with np.errstate(over="ignore"):
            return (1 / (1 + np.exp(self.beta * diffs))).sum(axis=1)

    def transform(self, df):
        """Return a new frame with the SRRF score and ranks derived from it.

        Args:
            df: Result frame with the columns listed in the class docstring.

        Returns:
            A frame with ``qid``, ``docno``, ``query`` and the fused ``score``,
            passed through ``pt.model.add_ranks``.
        """
        new_df = df[["qid", "docno", "query"]].copy()

        bm25_scores = df['score_0'].values.astype(np.float32)
        neural_scores = df['score'].values.astype(np.float32)
        sr_bm25 = np.empty(len(df))
        sr_neural = np.empty(len(df))
        # Soft ranks are only meaningful among the candidates of one query, so
        # compute them group by group. `indices` maps each qid to the integer
        # row positions of its documents. This also keeps the pairwise matrix
        # at (candidates per query)^2 instead of (all rows)^2.
        for rows in df.groupby("qid", sort=False).indices.values():
            sr_bm25[rows] = self._soft_rank(bm25_scores[rows])
            sr_neural[rows] = self._soft_rank(neural_scores[rows])

        new_df['score'] = 1 / (self.k + sr_bm25) + 1 / (self.k + sr_neural)
        return pt.model.add_ranks(new_df, single_query=False)


### Choose Datasets

In [15]:
# Available datasets: https://pyterrier.readthedocs.io/en/latest/datasets.html
dataset_name = "irds:beir/fiqa"
# `dataset` is the base dataset (its corpus is indexed below); `testset` is
# the "/test" variant whose topics and qrels are used later on.
dataset, testset = (
    pt.get_dataset(name) for name in (dataset_name, f"{dataset_name}/test")
)

# Build the Terrier index in memory. An index path has to be passed, but it
# will be ignored for this indexing type.
indexer = pt.IterDictIndexer(str(Path.cwd()), type=pt.index.IndexingType.MEMORY)
index_ref = indexer.index(dataset.get_corpus_iter(), fields=["text"])

### Set up Retrievers

In [16]:
from fast_forward.encoder import ContrieverEncoder

# BM25
bm25 = pt.terrier.Retriever(index_ref, wmodel="BM25")

# Encoding
# To change the encoder, consult:
# https://github.com/mrjleo/fast-forward-indexes/blob/main/src/fast_forward/encoder/transformer.py

from transformers import AutoTokenizer, AutoModel

# Optional local copy of the encoder checkpoint, for machines that cannot
# reach the Hugging Face Hub. Point this at your own download if you need
# one. When the directory does not exist, the encoder is created with its
# default model instead, so no manual (un)commenting is required.
local_model_path = 'C:/Users/win10 pro/Desktop/Danae_temp/ir-project/huggingface/TASBEncoder'
if Path(local_model_path).is_dir():
    # local_files_only=True: load strictly from disk, never from the network.
    # NOTE(review): `tokenizer` and `model` are not passed to the encoder
    # (it is given the path, not these objects); they are kept so this cell
    # still defines the same names as before.
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, local_files_only=True)
    model = AutoModel.from_pretrained(local_model_path, local_files_only=True)
    q_encoder = d_encoder = TASBEncoder(model=local_model_path, device=device)
else:
    # No local copy available: use the encoder's default model.
    q_encoder = d_encoder = TASBEncoder(device=device)

In [17]:
# Make the dataset identifier usable as part of a file name by mapping both
# ":" and "/" to "_".
safe_dataset_name = dataset_name.translate(str.maketrans(":/", "__"))
# The index file lives in ../indexes, relative to the current working directory.
ff_index_path = Path("..") / "indexes" / f"ffindex_{safe_dataset_name}_tasb2.h5"
# Report whether the index file is present before trying to load it.
print(ff_index_path.exists())

# Open the existing on-disk Fast-Forward index and copy it into memory.
#
# The fallback that builds the index when the file is missing is currently
# disabled, so the file presumably has to exist already. The disabled code is
# kept here for reference:
#
# try:
#     ff_index = OnDiskIndex.load(ff_index_path, query_encoder=q_encoder, mode=Mode.MAXP)
# except FileNotFoundError:
#     ff_index_path.parent.mkdir(exist_ok=True, parents=True)
#     ff_index = OnDiskIndex(
#         ff_index_path,
#         query_encoder=q_encoder,
#         mode=Mode.MAXP,
#     )
#     from fast_forward.util import Indexer
#     def docs_iter():
#         for d in dataset.get_corpus_iter():
#             yield {"doc_id": d["docno"], "text": d["text"]}
#
#     Indexer(ff_index, d_encoder, batch_size=8).from_dicts(docs_iter())
ff_index = OnDiskIndex.load(
    ff_index_path, query_encoder=q_encoder, mode=Mode.MAXP
).to_memory()

### Get scores and set up fusion techniques

In [18]:
# Scorer backed by the Fast-Forward index, and the two-stage pipeline used in
# the experiment: top-1000 BM25 results passed on to that scorer.
ff_score = FFScore(ff_index)
hybrid = bm25 % 1000 >> ff_score

# One instance of every fusion transformer.
ff_int = FFInterpolate(alpha=0.5)
ff_tm2c2 = FFTM2C2()
ff_rrf = FFRRF()
ff_srrf = FFSRRF()

# Quick check on a small input: the top-5 BM25 candidates per test topic,
# scored by `ff_score` and then run through each fusion transformer.
candidates = (bm25 % 5)(testset.get_topics())
re_ranked = ff_score(candidates)
ff_int(re_ranked)
ff_tm2c2(re_ranked)
ff_rrf(re_ranked)
ff_srrf(re_ranked)

### Experiment/Evaluation

In [19]:
# Compare BM25 against the two fused pipelines on the test topics.
# Cutoffs are given with the `@` operator (RR@10, nDCG@10, MAP@100), which is
# the documented way to parameterise these measure objects.
# baseline=0 makes the first system (BM25) the reference for the significance
# tests; the p-values are Bonferroni-corrected.
result = pt.Experiment(
    [bm25, hybrid >> ff_int, hybrid >> ff_tm2c2],
    testset.get_topics(),
    testset.get_qrels(),
    eval_metrics=[RR @ 10, nDCG @ 10, MAP @ 100],
    names=["BM25", "linear(alpha = 0.5)", "TM2C2"],
    baseline=0,
    correction="bonferroni",
)

print(result)

