## 0. Prerequisites
Create a new conda environment from the `rp-splade.yml` configuration file.

In [21]:
# One-time setup: build the conda environment described in rp-splade.yml.
!conda env create -f rp-splade.yml

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Retrieving notices: ...working... ^C


After creation, search for **'rp-splade'** in your IDE's available interpreters and **select** it for this notebook.

If you successfully selected it, then the next cell should pass the assertion.

In [4]:
import subprocess

# Derive the environment name from the interpreter that is actually running
# this kernel: `sys.prefix` is the root directory of the active environment,
# and for a conda environment created by name its last path component is that
# name. Unlike shelling out to `conda info | grep ...`, this needs neither
# conda nor grep on PATH and does not depend on the activation state the
# kernel process happened to inherit.
import sys
from pathlib import Path

result_str = Path(sys.prefix).name

expected_value = "rp-splade"
assert result_str == expected_value, f"Expected value: {expected_value}; Actual value: {result_str}"

## 1. Needed libraries

In [2]:
import torch
import pyterrier as pt
from pathlib import Path
from pyterrier.measures import RR, nDCG, MAP, R, MRR
from fast_forward.encoder import TCTColBERTQueryEncoder, TCTColBERTDocumentEncoder, TransformerEncoder
from fast_forward import OnDiskIndex, Mode, Indexer
from fast_forward.util.pyterrier import FFScore, FFInterpolate
import io
from contextlib import redirect_stdout
import pandas as pd

## 2. Set-up
### 2.1. Encoder settings

In [6]:
# Checkpoint name shared by the query encoder and the document encoder
transformer_name = 'castorini/tct_colbert-msmarco'

# Dual-encoder architecture: one encoder for queries, one for documents.
# Only the document encoder is given an explicit device; it uses the first GPU
# when CUDA is available and falls back to the CPU otherwise.
q_encoder = TCTColBERTQueryEncoder(transformer_name)
d_encoder = TCTColBERTDocumentEncoder(
    transformer_name,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at castorini/tct_colbert-msmarco were not used when initializing BertModel

### 2.2. Initialize PyTerrier

In [3]:
# Initialize PyTerrier only if it has not been started yet, so re-running this
# cell in the same kernel skips the second pt.init() call.
if not pt.started():
    pt.init(tqdm="notebook")

PyTerrier 0.10.0 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### 2.3. Required functions

In [5]:
# The four lists below are parallel: position i of each list refers to the
# same dataset, and the rest of the notebook addresses them by that position.

# Dataset names (human readable format)
dataset_names = [
    'fiqa',
    'nfcorpus',
    'scifact',
    'quora',
    'hotpotqa',
    'dbpedia',
    'fever',
]

# Identifiers of the test splits, passed to pt.get_dataset
testset_names = [
    'irds:beir/fiqa/test',
    'irds:beir/nfcorpus/test',
    'irds:beir/scifact/test',
    'irds:beir/quora/test',
    'irds:beir/hotpotqa/test',
    'irds:beir/dbpedia-entity/test',
    'irds:beir/fever/test',
]

# Sparse indexes used in the evaluations (paths relative to the current directory)
sparse_indexes = [
    'sparse_indexes/sparse_index_fiqa',
    'sparse_indexes/sparse_index_nfcorpus',
    'sparse_indexes/sparse_index_scifact',
    'sparse_indexes/sparse_index_quora',
    'sparse_indexes/sparse_index_hotpotqa',
    'sparse_indexes/sparse_index_dbpedia-entity_with_encoding',
    'sparse_indexes/sparse_index_fever_with_encoding',
]

# Dense (Fast-Forward) index files (paths relative to the current directory)
dense_indexes = [
    'dense_indexes/ffindex_fiqa_tct.h5',
    'dense_indexes/ffindex_nfcorpus_tct.h5',
    'dense_indexes/ffindex_scifact_tct.h5',
    'dense_indexes/ffindex_quora_tct.h5',
    'dense_indexes/ffindex_hotpotqa_tct.h5',
    'dense_indexes/ffindex_dbpedia-entity_tct.h5',
    'dense_indexes/ffindex_fever_tct.h5',
]

In [6]:
class SwapQueries(pt.Transformer):
    """PyTerrier transformer that exchanges the ``query`` and ``query_0`` columns.

    Frames that have no ``query_0`` column are passed through unchanged.
    """

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` with the contents of ``query`` and ``query_0`` swapped.

        Args:
            df: Input frame. When it has a ``query_0`` column it must also
                have a ``query`` column.

        Returns:
            A new frame with the two columns exchanged, or ``df`` itself when
            there is no ``query_0`` column. The input is never modified.
        """
        if 'query_0' not in df:
            return df
        # assign() operates on a copy and both right-hand sides are read from
        # the untouched input, so no scratch column is needed. The previous
        # implementation used a temporary "query_copy" column, which silently
        # overwrote and dropped any input column of that name.
        return df.assign(query=df["query_0"], query_0=df["query"])

def read_alpha_from_file(retrieval_model_name: str, dataset_name: str) -> float:
    """Read the alpha value stored for a retrieval model / dataset pair.

    The value is parsed from the second-to-last line of
    ``alpha_tuning/{retrieval_model_name}/{dataset_name}.txt``: the text
    following the first ``": "`` on that line is converted to ``float``.

    Args:
        retrieval_model_name: Sub-directory of ``alpha_tuning/``.
        dataset_name: File stem of the ``.txt`` file to read.

    Returns:
        The parsed alpha.
    """
    alpha_path = f"alpha_tuning/{retrieval_model_name}/{dataset_name}.txt"
    with open(alpha_path, "r") as handle:
        second_to_last_line = handle.readlines()[-2]
        fields = second_to_last_line.split(": ")
        return float(fields[1])

def ff_with_swap(input, ff_score: FFScore, alpha: float) -> pd.DataFrame:
    """Swap the query columns, then score and interpolate ``input``.

    Builds the pipeline ``SwapQueries >> ff_score >> FFInterpolate(alpha)``
    and applies it to ``input``.
    """
    pipeline = SwapQueries() >> ff_score >> FFInterpolate(alpha)
    return pipeline(input)

def ff(input, ff_score: FFScore, alpha: float) -> pd.DataFrame:
    """Score ``input`` with ``ff_score`` and interpolate the result with ``alpha``.

    Builds the pipeline ``ff_score >> FFInterpolate(alpha)`` and applies it to
    ``input``.
    """
    pipeline = ff_score >> FFInterpolate(alpha)
    return pipeline(input)

def retrieve(topics, model) -> pd.DataFrame:
    """Apply the callable ``model`` to ``topics`` and return whatever it produces."""
    results = model(topics)
    return results

def measure_retrieval(test_set, curr_index, model, model_name: str) -> pd.DataFrame:
    """Time first-stage retrieval over a test set's topics and log the latency.

    Runs IPython's ``%timeit`` on ``retrieve(topics, model)``, appends the
    printed timing report and the derived per-query latency to
    ``latency/{model_name}/{dataset}.txt`` and then performs one more, untimed
    retrieval whose result is returned.

    Args:
        test_set: Dataset object exposing ``get_topics()``.
        curr_index: Position of the dataset in ``dataset_names``.
        model: Callable retrieval pipeline that is applied to the topics.
        model_name: Sub-directory of ``latency/`` that receives the log file.

    Returns:
        The result of ``retrieve(topics, model)``.
    """
    topics = test_set.get_topics()

    # This call is what the ``%timeit`` line magic expands to. ``-o`` makes it
    # return a TimeitResult in addition to printing its report, so the mean can
    # be read as a number instead of being parsed out of the printed text.
    # Parsing failed whenever the report was not expressed in plain seconds
    # (ms, us, min) or was preceded by the "slowest run" warning line.
    f = io.StringIO()
    with redirect_stdout(f):
        timing = get_ipython().run_line_magic("timeit", "-o retrieve(topics, model)")

    # The captured report is still written to the log verbatim
    timeit_result = f.getvalue()
    elapsed_time = timing.average * 1000  # mean seconds per loop -> milliseconds
    number_of_queries = len(topics)

    log_path = Path(f"latency/{model_name}/{dataset_names[curr_index]}.txt")
    # Create the per-model directory on first use instead of failing on open()
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a") as file1:
        file1.write('Latency for retrieval:\n')
        file1.write(f"Timeit result: {timeit_result}")
        file1.write(f"There are {number_of_queries} queries in the dataset.\n")
        file1.write(f"This comes to {round(float(elapsed_time / number_of_queries), 2)} ms/query \n\n")

    # %timeit discards the value of the timed expression, so run once more for the ranking
    return retrieve(topics, model)

def measure_reranking(input_df: pd.DataFrame, test_set, curr_index, model_name: str, ff_score=None, alpha=None):
    """Time Fast-Forward re-ranking of ``input_df`` and log the latency.

    Runs IPython's ``%timeit`` on ``ff_with_swap`` (for the model names
    ``deepct`` and ``splade``, compared case-insensitively) or on ``ff`` (for
    every other model name) and appends the printed timing report and the
    derived per-query latency to ``latency/{model_name}/{dataset}.txt``.

    Args:
        input_df: Ranking to re-rank.
        test_set: Dataset object exposing ``get_topics()``; only the number of
            topics is used.
        curr_index: Position of the dataset in ``dataset_names``.
        model_name: Selects the timed function and the log sub-directory.
        ff_score: Scoring transformer forwarded to ``ff`` / ``ff_with_swap``.
        alpha: Interpolation weight forwarded to ``ff`` / ``ff_with_swap``.
    """
    if model_name.lower() in ['deepct', 'splade']:
        timed_call = "ff_with_swap(input_df, ff_score, alpha)"
    else:
        timed_call = "ff(input_df, ff_score, alpha)"

    # This call is what the ``%timeit`` line magic expands to. ``-o`` makes it
    # return a TimeitResult in addition to printing its report, so the mean can
    # be read as a number instead of being parsed out of the printed text.
    # Parsing failed whenever the report was not expressed in plain seconds
    # (ms, us, min) or was preceded by the "slowest run" warning line.
    f = io.StringIO()
    with redirect_stdout(f):
        timing = get_ipython().run_line_magic("timeit", "-o " + timed_call)

    # The captured report is still written to the log verbatim
    timeit_result = f.getvalue()
    elapsed_time = timing.average * 1000  # mean seconds per loop -> milliseconds
    number_of_queries = len(test_set.get_topics())

    log_path = Path(f"latency/{model_name}/{dataset_names[curr_index]}.txt")
    # Create the per-model directory on first use instead of failing on open()
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a") as file1:
        file1.write('Latency for re-ranking:\n')
        file1.write(f"Timeit result: {timeit_result}")
        file1.write(f"There are {number_of_queries} queries in the dataset.\n")
        file1.write(f"This comes to {round(float(elapsed_time / number_of_queries), 2)} ms/query \n\n\n")

def measure_latency(test_set, curr_index, model, model_name: str, ff_score=None, alpha=None):
    """Measure retrieval latency, then re-ranking latency on the retrieved ranking.

    The ranking returned by ``measure_retrieval`` is passed to
    ``measure_reranking`` as its input; both steps log under the same
    ``model_name``.
    """
    first_stage_ranking = measure_retrieval(test_set, curr_index, model, model_name)
    measure_reranking(
        first_stage_ranking,
        test_set,
        curr_index,
        model_name,
        ff_score,
        alpha,
    )

def measure_reranking_sprint(file_path, test_set, curr_index, model_name, ff_score, alpha):
    """Time Fast-Forward re-ranking for a ranking that is stored in a run file.

    The run file is read, joined with the test set's topics on ``qid`` and the
    joined frame is handed to ``measure_reranking``.

    Args:
        file_path: Space-separated, header-less run file with six columns, of
            which the first, third, fourth and fifth are used as ``qid``,
            ``docno``, ``rank`` and ``score``.
        test_set: Dataset object exposing ``get_topics()``.
        curr_index: Position of the dataset in ``dataset_names``.
        model_name: Forwarded to ``measure_reranking``.
        ff_score: Forwarded to ``measure_reranking``.
        alpha: Forwarded to ``measure_reranking``.
    """
    # Name the columns, drop the unused ones and read the identifiers as
    # strings in a single step. Reading IDs as strings up front keeps them
    # exactly as written in the file; parsing them as numbers first and casting
    # afterwards loses leading zeros. It also avoids assigning to a column of a
    # sliced frame.
    run = pd.read_csv(
        file_path,
        sep=' ',
        header=None,
        names=['qid', 'ignore_1', 'docno', 'rank', 'score', 'ignore_2'],
        usecols=['qid', 'docno', 'rank', 'score'],
        dtype={'qid': str, 'docno': str},
    )

    # The join key must have the same type on both sides
    topics = test_set.get_topics()
    topics = topics.assign(qid=topics['qid'].astype('str'))

    df_merged = pd.merge(run, topics, on='qid')

    measure_reranking(df_merged, test_set, curr_index, model_name, ff_score, alpha)

    

## 3. Evaluation

In [7]:
def logic(curr_index):
    """Run every latency measurement for the dataset at position ``curr_index``.

    Loads the dataset's dense index into memory, opens its three sparse
    indexes, builds the BM25, TF-IDF, DeepCT and SPLADE retrievers together
    with their alpha values, and measures retrieval plus re-ranking latency for
    each of them. Afterwards re-ranking latency is measured for the uniCOIL and
    DeepImpact rankings stored under ``runs/``.

    Args:
        curr_index: Position of the dataset in ``dataset_names``,
            ``testset_names``, ``sparse_indexes`` and ``dense_indexes``.
    """
    # Dense index: load from disk, move it into main memory, wrap it for scoring
    on_disk_index = OnDiskIndex.load(
        Path(dense_indexes[curr_index]).resolve(), query_encoder=q_encoder, mode=Mode.MAXP
    )
    ff_score = FFScore(on_disk_index.to_memory())

    # Sparse indexes: the plain one plus the "_deepct" and "_splade" variants
    index_ref, deepct_index_ref, splade_index_ref = (
        pt.IndexFactory.of(Path(sparse_indexes[curr_index] + suffix).resolve().as_posix(), memory=True)
        for suffix in ("", "_deepct", "_splade")
    )

    dataset = dataset_names[curr_index]

    # Retrieval models, each followed by the alpha stored for it
    bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")
    alpha_bm25 = read_alpha_from_file('bm25', dataset)

    tf_idf = pt.BatchRetrieve(index_ref, wmodel="TF_IDF")
    alpha_tfidf = read_alpha_from_file('tf_idf', dataset)

    deep_ct = pt.BatchRetrieve(deepct_index_ref, wmodel="BM25")
    alpha_deepct = read_alpha_from_file('deepct', dataset)

    # SPLADE: query-side transformer chained with retrieval on the SPLADE index
    import pyt_splade
    splade = pyt_splade.SpladeFactory().query() >> pt.BatchRetrieve(splade_index_ref, wmodel="Tf")
    alpha_splade = read_alpha_from_file('splade', dataset)

    test_set = pt.get_dataset(testset_names[curr_index])

    # Retrieval + re-ranking latency for the models retrieved in this notebook:
    # (model, alpha, name used for the latency log, progress message)
    retrieved_here = (
        (bm25, alpha_bm25, "bm25", 'Finished for BM25'),
        (tf_idf, alpha_tfidf, "tfidf", 'Finished for TF-IDF'),
        (deep_ct, alpha_deepct, "deepct", 'Finished for DeepCT'),
        (splade, alpha_splade, "splade", 'Finished for SPLADE'),
    )
    for model, alpha, name, message in retrieved_here:
        measure_latency(model=model, alpha=alpha, model_name=name, test_set=test_set,
                        curr_index=curr_index, ff_score=ff_score)
        print(message)

    # Re-ranking latency only, for the uniCOIL and DeepImpact rankings read from run files
    for name in ("unicoil", "deepimpact"):
        alpha = read_alpha_from_file(name, dataset)
        measure_reranking_sprint(file_path=f'runs/{name}_{dataset}_test_run.tsv',
                                 test_set=test_set, curr_index=curr_index, model_name=name,
                                 ff_score=ff_score, alpha=alpha)

In [9]:
# Benchmark the datasets at positions 1 and 2 of dataset_names (nfcorpus and scifact)
for curr_index in (1, 2):
    logic(curr_index)

100%|██████████| 3633/3633 [00:00<00:00, 1690658.65it/s]


Finished for BM25
Finished for TF-IDF
Finished for DeepCT




Finished for SPLADE


100%|██████████| 5183/5183 [00:00<00:00, 2318834.95it/s]


Finished for BM25
Finished for TF-IDF
Finished for DeepCT




Finished for SPLADE
