In [2]:
# Get the queries decomposed
# This is the dataset that we will be actually using - one claim gets split into several (using ClaimDecomp) and then it comes with a top-1 evidence for each
import gc
import itertools
import json
import os
import re
import shutil

import numpy as np
import pandas as pd
import pyterrier as pt
import torch
from rerankers import Document, Reranker
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


TODO:
* Experiment with bm25s to make it faster and also work with the stemmer for numbers - https://github.com/xhluca/bm25s 
* Refactor the custom tokenizer for BM25 here and verify how good it is.
* Follow up on https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt#tokenizers and https://huggingface.co/spaces/huggingface/number-tokenization-blog
* Investigate what else, in terms of tokenization/adaptation for BM25, can already be done at this step (keeping in mind that this first step uses a word-based tokenizer).
* Investigate parallelization with python-terrier: https://pyterrier.readthedocs.io/en/latest/parallel.html

In [3]:
# Notebook settings: everything a reader might want to tune lives here.

top_k = 100         # only the first 100 BM25 hits per claim are handed to the reranker
top_reranked = 25   # only the first 25 reranked documents per claim are stored
gpu = False         # controls the reranking part
clean_index = True  # rebuild the index?
wmodel = "BM25"     # retriever model

# Choose the reranker that matches the hardware, then build it with one call.
if gpu:
    # Alternative: "sentence-transformers/paraphrase-MiniLM-L6-v2"
    # (the model used by the authors of the QuanTemp paper).
    reranking_model = 'mixedbread-ai/mxbai-rerank-large-v1'  # recommended by the reranking package
    ranker_kwargs = {'model_type': 'cross-encoder'}
else:
    reranking_model = "flashrank"  # reranker optimized for CPU
    ranker_kwargs = {}

ranker = Reranker(reranking_model, **ranker_kwargs)

# PyTerrier needs a started Java backend; start it only once per kernel.
if not pt.java.started():
    pt.java.init()


Loading default flashrank model for language en
Default Model: ms-marco-MiniLM-L-12-v2
Loading FlashRankRanker model ms-marco-MiniLM-L-12-v2 (this message can be suppressed by setting verbose=0)
Loading model FlashRank model ms-marco-MiniLM-L-12-v2...


Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


# Load and inspect the datasets

Create and prepare full evidence corpus for retrieval task


In [4]:
#  https://drive.google.com/drive/folders/1GYzSK0oU2MiaKbyBO3hE8kO4gdmxDjCv
# Read the JSON explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open("../../data/corpus_evidence_unified.json", encoding="utf-8") as f:
    corpus = json.load(f)

# Prepare for PyTerrier: one record per corpus entry, with the entry's key
# (stringified) as "docno" and its value as "text".
corpus_data = [{"docno": str(idx), "text": text} for idx, text in corpus.items()]
corpus_df = pd.DataFrame(corpus_data)

Create train and validation claim datasets (original claims)


In [5]:
# Load the original (non-decomposed) train and validation claims.
# Read the JSON explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open("../../data/English/train_claims_quantemp.json", encoding="utf-8") as f:
    train_data = json.load(f)

with open("../../data/English/val_claims_quantemp.json", encoding="utf-8") as f:
    val_data = json.load(f)

def prep_claims_for_retrieval(data):
    """Turn a sequence of claim records into a query frame for retrieval.

    Args:
        data: iterable of mappings, each providing the claim text under the
            key ``'claim'``.

    Returns:
        A DataFrame with the columns ``qid`` (the record's position in
        ``data``) and ``query`` (the claim text).
    """
    rows = [(position, record['claim']) for position, record in enumerate(data)]
    return pd.DataFrame(rows, columns=["qid", "query"])
# Build the (qid, query) frames for the train and validation splits
claims_df_train, claims_df_val = (
    prep_claims_for_retrieval(split) for split in (train_data, val_data)
)

In [6]:
# Fix some character problems that are specific to PyTerrier - important, see:

# https://github.com/terrier-org/pyterrier/issues/253

# Fetch a tokeniser instance from Terrier's Java Tokeniser class; it is used
# below to clean the claim texts before they are sent to the retriever.
tokenizer = pt.java.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()

def strip_markup(text):
    """Clean a query string so it can be passed to the retriever.

    Every run of slashes and/or backslashes is replaced by a single space,
    the result is passed through the module-level ``tokenizer``, and the
    tokens it returns are joined with single spaces.

    Args:
        text: the raw query text.

    Returns:
        The space-joined tokens.
    """
    without_slashes = re.sub(r'[\\/]+', ' ', text)
    tokens = tokenizer.getTokens(without_slashes)
    return " ".join(tokens)

# Build the query-cleaning transformer once and apply it to both splits.
clean_queries = pt.apply.query(lambda row: strip_markup(row.query))

claims_df_train = clean_queries(claims_df_train)
claims_df_val = clean_queries(claims_df_val)

In [None]:
#claims_df_val.iloc[3,:]['query']

# Step 1: Sparse Retrieval (BM25)

In [7]:
# Prepare the corpus and build retriever
# Define index path
index_path = os.path.abspath("./pyterrier_index")
index_properties = os.path.join(index_path, "data.properties")

# Recreate the index from scratch only when requested
if clean_index and os.path.exists(index_path):
    shutil.rmtree(index_path)

if os.path.isfile(index_properties):
    # clean_index is False and an index is already on disk: reuse it instead
    # of indexing again into a directory that already holds an index.
    index_ref = pt.IndexRef.of(index_properties)
else:
    # Index the corpus
    indexer = pt.IterDictIndexer(index_path)
    index_ref = indexer.index(corpus_df.to_dict("records"))

# Initialize the retriever; it returns at most top_k results per query
bm25 = pt.terrier.Retriever(index_ref,
                            wmodel=wmodel,
                            num_results=top_k)  # there are other options here

17:14:32.493 [ForkJoinPool-1-worker-3] WARN org.terrier.structures.indexing.Indexer -- Indexed 5 empty documents


In [8]:
def retrieve_evidence_for_claims(retriever, claims_df):
    """Run retrieval for all claims and return one post-processed result frame.

    Args:
        retriever: object whose ``transform`` method maps the query frame to
            a result frame containing at least ``qid``, ``rank``, ``score``
            and ``docno`` columns.
        claims_df: query frame passed to ``retriever.transform``.

    Returns:
        The retrieval results ordered by ``qid`` and ``rank``, without the
        ``rank`` column, with ``score`` renamed to ``score_retriever``, and
        left-joined with the module-level ``corpus_df`` on ``docno``.
    """
    ranked_hits = retriever.transform(claims_df).sort_values(by=['qid', 'rank'])

    return (
        ranked_hits
        .drop(columns=['rank'])
        .rename(columns={'score': 'score_retriever'})
        .merge(corpus_df, on="docno", how="left")
    )

In [None]:
# Retrieve BM25 candidates for both splits. The train results must be computed
# as well: the grouping and reranking cells further down use retrievalbm25_train
# and would raise a NameError on a fresh kernel otherwise.
retrievalbm25_train = retrieve_evidence_for_claims(bm25, claims_df_train)
retrievalbm25_val = retrieve_evidence_for_claims(bm25, claims_df_val)

# Step 2: Reranking



We use rerankers library for this step, for reference refer to the repo readme here: https://github.com/AnswerDotAI/rerankers and the blogpost here: https://www.answer.ai/posts/2024-09-16-rerankers.html 

Ideally, we should use a cross-encoder here, as the dataset is small. In a cross-encoder, the query and document tokens are fed together into a single transformer-based network, as opposed to a bi-encoder, which encodes the query and the document separately and then computes the cosine similarity between the two embeddings.


In [None]:
# Group the retrieved candidates per claim (qid) so that each claim can be reranked on its own
retrievalbm25_train_grouped = retrievalbm25_train.groupby(by='qid')
retrievalbm25_val_grouped = retrievalbm25_val.groupby(by='qid')

In [None]:
def reranking(ranker, retrieved_query_evidence, top_reranked=25, gpu=False):
    """Rerank the retrieved documents of every query group.

    Args:
        ranker: object with a ``rank(query=..., docs=..., doc_ids=...)``
            method whose result offers ``top_k(n)``.
        retrieved_query_evidence: iterable of ``(key, group)`` pairs, where
            each ``group`` is a frame with ``qid``, ``query``, ``text`` and
            ``docno`` columns (e.g. a ``groupby('qid')`` object).
        top_reranked: how many reranked entries to keep per group.
        gpu: when true, the ranking call runs inside ``torch.no_grad()``.

    Returns:
        A list with one ``(qid, top_entries)`` tuple per group, where ``qid``
        is taken from the group's first row.
    """
    def _rerank_group(candidates):
        # The query text is read from the first row of the group.
        ranked = ranker.rank(
            query=candidates['query'].iloc[0],
            docs=candidates['text'].tolist(),
            doc_ids=candidates['docno'].tolist(),
        )
        return candidates['qid'].iloc[0], ranked.top_k(top_reranked)

    rerank_results = []
    for _, candidates in tqdm(retrieved_query_evidence):
        if gpu:
            with torch.no_grad():
                entry = _rerank_group(candidates)
        else:
            entry = _rerank_group(candidates)
        rerank_results.append(entry)

        # Force a garbage-collection pass after every group
        # (torch.cuda.empty_cache() is intentionally not called here).
        gc.collect()

    return rerank_results


In [None]:
# Optional - just do it for the first 100 groups in order to subset the data and inspect the result quicker
n_inspect = 100
retrievalbm25_train_grouped_first100 = itertools.islice(retrievalbm25_train_grouped, n_inspect)
retrievalbm25_val_grouped_first100 = itertools.islice(retrievalbm25_val_grouped, n_inspect)

# Use the values from the notebook settings cell instead of repeating literals,
# so that changing top_reranked / gpu there actually takes effect here.
reranked_train = reranking(ranker=ranker,
                           retrieved_query_evidence=retrievalbm25_train_grouped_first100,
                           top_reranked=top_reranked,
                           gpu=gpu)
reranked_val = reranking(ranker=ranker,
                         retrieved_query_evidence=retrievalbm25_val_grouped_first100,
                         top_reranked=top_reranked,
                         gpu=gpu)