In [36]:
# Get the queries decomposed
# This is the dataset that we will be actually using - one claim gets split into several (using ClaimDecomp) and then it comes with a top-1 evidence for each
import json
from rank_bm25 import BM25Okapi
import re
import dateparser
from joblib import Parallel, delayed
from multiprocessing import Manager
import numpy as np

import utils


# Load and inspect the datasets

Full corpus


In [37]:
#  https://drive.google.com/drive/folders/1GYzSK0oU2MiaKbyBO3hE8kO4gdmxDjCv
# JSON is UTF-8 by specification; state the encoding explicitly so the load
# does not depend on the platform's default locale encoding.
with open("../../data/corpus_evidence_unified.json", encoding="utf-8") as f:
    corpus = json.load(f)


  

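bm25_top_100_claimdecomp.json: supposedly the BM25 top-100 evidence per decomposed claim, but the file looks wrong (for the first entry, the number of scores and the number of doc ids differ; see the check below)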

In [38]:
# Explicit UTF-8 so the load does not depend on the platform's default encoding.
with open("../../data/bm25_top_100_claimdecomp.json", encoding="utf-8") as f:
    bm25_top_100_claimdecomp = json.load(f)


In [39]:
# Compare how many scores and how many document ids the first entry holds.
first_entry = bm25_top_100_claimdecomp[0]
score_count = len(first_entry['scores'])
doc_id_count = len(first_entry['doc_id'])
print('number of scores doc 1:{}'.format(score_count))
print('number of doc_Id doc 1:{}'.format(doc_id_count))
 

number of scores doc 1:300
number of doc_Id doc 1:252


In [18]:
# Bm25 evidence files content - seems to be validation only

# Explicit UTF-8 so the load does not depend on the platform's default encoding.
with open("../../data/English/bm25_evidence/val_claim_only_evidence_final.json", encoding="utf-8") as f:
    bm25_evidence_val_claim_only = json.load(f)


In [None]:
# Only the last expression of a cell is displayed, so the len() result on the next line is not shown.
len(bm25_evidence_val_claim_only) # as the name suggests, this file is validation only; the train top 100 is not here
# Scores stored for the first validation entry.
bm25_evidence_val_claim_only[0]['scores']

claim_to_bm25_evidence_mapping

In [33]:

# Explicit UTF-8 so the load does not depend on the platform's default encoding.
with open("../../data/English/claim_to_bm25_evidence_mapping/train_claimonly_evidence_question_mapping.json", encoding="utf-8") as f:
    train_claimonly_evidence_question_mapping = json.load(f)


In [32]:
# This is top 3 per original claim.
# Show only the first entries: displaying the whole list floods the notebook output.
train_claimonly_evidence_question_mapping[:2]

[{'claim': 'In her budget speech, Nirmala Sitharaman claimed that the Government distributed 35,000 crore LED bulbs in the country.',
  'evidences': [{'questions': 'In her budget speech, Nirmala Sitharaman claimed that the Government distributed 35,000 crore LED bulbs in the country.',
    'top_k_doc': ["conclusion: nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video fo the speech it is clear that she said 35 crores. nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video of the speech, it is clear that she said 35 crores, not 35000 crores.",
     'jul 5, 2019  approximately 35 crore led bulbs have been distributed under ujala yojana leading to cost saving of 18,341 crores annually.66 pages',
     '5 gen 2022  the power ministry has distributed 36.78 crore led lights under the ujala programme in seven years, which saved 47778 million units of ...']}],
  'label': 'False'},
 {'claim': 'Florida residents affec

In [34]:

# Explicit UTF-8 so the load does not depend on the platform's default encoding.
with open("../../data/English/decomposed_questions_with_mapped_bm25_evidence/train_claimdecomp_evidence_question_mapping.json", encoding="utf-8") as f:
    train_claimdecomp_evidence_question_mapping = json.load(f)

In [35]:
# Show only the first entries: displaying the whole list floods the notebook output.
train_claimdecomp_evidence_question_mapping[:2]

[{'claim': 'In her budget speech, Nirmala Sitharaman claimed that the Government distributed 35,000 crore LED bulbs in the country.',
  'evidences': [{'questions': 'did nirmala sitharaman claim that the government distributed 35,000 crore led bulbs in the country?\n',
    'top_k_doc': ["conclusion: nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video fo the speech it is clear that she said 35 crores. nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video of the speech, it is clear that she said 35 crores, not 35000 crores."]},
   {'questions': ' is there evidence to support the claim that the government distributed 35,000 crore led bulbs?\n',
    'top_k_doc': ["conclusion: nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video fo the speech it is clear that she said 35 crores. nirmala sitharaman didn't claim that govt distributed 35,000 crores led bulbs. from the video

TODO:
* Experiment with bm25s to make it faster and also work with the stemmer for numbers - https://github.com/xhluca/bm25s 
* Refactor the custom tokenizer for BM25 here and verify how good it is.
* Follow up on https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt#tokenizers and https://huggingface.co/spaces/huggingface/number-tokenization-blog
* Investigate what else in terms of tokenization/adaptation for BM25 can already be done at this step (keeping in mind that this first step is a word-based tokenizer).

# PART 1: BM25 RETRIEVAL

The goal of the following tokenizer function is to introduce some normalizations (e.g. for covid, numbers, dates, and crore) so that BM25 can recognize similar words that are written differently.

Numbers in BM25

    Tokenization:
        When tokenizing text, numbers are typically split based on spaces or punctuation, just like words. For example:
            "In 2023, 10% of users" → Tokens: ["In", "2023", "10", "of", "users"].
        Each number is treated as an independent token, similar to a word.

    Scoring:
        BM25 assigns a term frequency (TF) and inverse document frequency (IDF) to numbers just like it does to any other word.
        If a number (e.g., "2023") appears frequently in both the query and the document, it will contribute to the document's relevance score.

    Implications:
        Numbers in claims and evidence (e.g., "86,000" or "5%") are matched based on their exact representation.
        BM25 does not understand numerical relationships or semantic context. For example:
            It cannot recognize that "10%" is similar to "0.1" or "10 percent".
            It does not reason about magnitudes or intervals (e.g., "86,000" vs. "85,000").

Limitations of Treating Numbers as Words

    Exact Match Dependency:
        If the number format in the query does not exactly match the format in the document (e.g., "10%" vs. "10 percent"), BM25 will not treat them as equivalent.

    No Numerical Reasoning:
        BM25 cannot interpret numerical relations like comparisons ("greater than", "equal to") or ranges ("10-20").

    Potential for Noise:
        Numbers that are irrelevant to the query might still appear frequently in documents, leading to noise in retrieval.


Load evidence corpus

In [2]:
# Reloads the same evidence file that is read at the start of the notebook.
# Explicit UTF-8 so the load does not depend on the platform's default encoding.
with open("../../data/corpus_evidence_unified.json", encoding="utf-8") as f:
    corpus = json.load(f)

Load claim corpus

In [None]:
# Explicit UTF-8 so the loads do not depend on the platform's default encoding.
with open("../../data/English/train_claims_quantemp.json", encoding="utf-8") as f:
    train_data = json.load(f)

with open("../../data/English/val_claims_quantemp.json", encoding="utf-8") as f:
    val_data = json.load(f)
 

In [5]:
# One (claim_id, claim text) pair per training claim; claim_id is the position in train_data.
all_claims = [
    (claim_id, claim['claim'])
    for claim_id, claim in enumerate(train_data)
]

# One ((claim_id, question_id), question) pair per entry of each claim's 'evidences' list.
all_synthetic_questions = [
    ((claim_id, question_id), evidence['questions'])
    for claim_id, claim in enumerate(train_data)
    for question_id, evidence in enumerate(claim['evidences'])
]

In [6]:
 def preprocess_corpus(evidence_dict):
     """Run ``tokenizer`` over every document of the evidence corpus in parallel.

     The dictionary keys are dropped: each document is identified by its
     position in ``list(evidence_dict.values())``, and that position is passed
     to ``tokenizer`` as its second argument.

     Args:
         evidence_dict: Mapping whose values are the evidence documents.

     Returns:
         A list with one ``tokenizer`` result per document.
     """
     documents = list(evidence_dict.values())

     # Build the tasks lazily; joblib consumes them across all available cores.
     tokenize_job = delayed(tokenizer)
     jobs = (
         tokenize_job(document, position)
         for position, document in enumerate(documents)
     )
     worker_pool = Parallel(n_jobs=-1)
     return worker_pool(jobs)

In [7]:
def preprocess_queries(queries_data):
    """Run ``tokenizer`` over every query in parallel.

    Args:
        queries_data: Iterable of ``(query_id, text)`` pairs. Each pair is
            forwarded to ``tokenizer`` as ``tokenizer(text, query_id)``.

    Returns:
        A list with one ``tokenizer`` result per query.
    """
    # Build the tasks lazily; joblib consumes them across all available cores.
    tokenize_job = delayed(tokenizer)
    jobs = (tokenize_job(text, query_id) for query_id, text in queries_data)
    worker_pool = Parallel(n_jobs=-1)
    return worker_pool(jobs)

In [8]:
# Tokenize every synthetic question; each ((claim_id, question_id), question) pair is handed to the tokenizer.
tokenized_queries = preprocess_queries(all_synthetic_questions)

In [9]:
# Tokenize the values of the evidence corpus (parallelised with joblib inside preprocess_corpus).
tokenized_evidence = preprocess_corpus(corpus)

In [10]:
# # Are all of them split in 3 questions? probably not - verify that
# train_data

In [11]:
# Function to process a single query, return back indices and scores
def process_query(query_tokens, id, top_k=100):
    """Score one tokenized query against the BM25 index and keep the best hits.

    Reads the module-level ``bm25`` index, which ``init_bm25`` sets.

    Args:
        query_tokens: Tokens of the query, passed to ``bm25.get_scores``.
        id: Identifier returned unchanged so the result can be matched to its
            query. The name shadows the ``id`` builtin; it is kept so existing
            callers are not affected.
        top_k: Maximum number of hits to return.

    Returns:
        Tuple ``(id, top_indices, top_scores)``. ``top_indices`` are the
        positions of the highest-scoring documents in descending score order,
        and ``top_scores[i]`` is the score of document ``top_indices[i]``.
    """
    scores = np.asarray(bm25.get_scores(query_tokens))
    # Rank once and read the scores through the same indices. The original
    # sorted the full score array a second time to obtain the same values.
    top_indices = np.argsort(scores)[::-1][:top_k]
    top_scores = scores[top_indices]
    return (id, top_indices, top_scores)

In [12]:
# Initialize BM25
def init_bm25(evidence_corpus):
    """Build the BM25 index and publish it as the module-level ``bm25``.

    Args:
        evidence_corpus: Passed unchanged to ``BM25Okapi``.

    Returns:
        The newly built index. The same object is stored in the global
        ``bm25`` that ``process_query`` reads, so existing callers that ignore
        the return value keep working.
    """
    global bm25
    bm25 = BM25Okapi(evidence_corpus)
    return bm25


In [13]:
# Each tokenized_evidence item unpacks into a pair; only its second element goes into the index.
evidence_corpus = [
    doc_tokens
    for _doc_position, doc_tokens in tokenized_evidence
]
init_bm25(evidence_corpus)

In [14]:
 # Process All queries in parallel
# Each tokenized_queries item unpacks into (query id, query tokens); every task
# returns the (id, top_indices, top_scores) tuple built by process_query.
# NOTE(review): process_query reads the module-level bm25; with the
# "multiprocessing" backend this presumably relies on the workers inheriting
# that global. Confirm on platforms that spawn rather than fork.
results = Parallel(n_jobs=-1, backend="multiprocessing")(
    delayed(process_query)(query_tokens, query_id)
    for query_id, query_tokens in tokenized_queries
)
    

In [15]:
import pickle

# Save an object to a file
def save_pickle(obj, filename):
    """
    Save a Python object to a file using pickle.

    Missing parent directories of ``filename`` are created first, so saving
    into a fresh output folder does not fail with FileNotFoundError.

    Args:
        obj: The Python object to save.
        filename: The name of the file where the object will be saved.
            An existing file is overwritten.
    """
    import os  # local import keeps this cell self-contained

    # open() also accepts integer file descriptors; only path-like values
    # have a parent directory to create.
    if isinstance(filename, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:  # empty when filename has no directory component
            os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

# NOTE: `data` is a leftover example value; the save_pickle calls below do not use it.
data = {'key': 'value', 'numbers': [1, 2, 3, 4, 5]}
# Persist the BM25 results (reloaded in Part 2) and the tokenized queries.
save_pickle(results, '../../output/bm25-1st-step-evidence.pkl')
save_pickle(tokenized_queries, '../../output/tokenized_queries.pkl')


# PART 2: SEMANTIC RERANKING WITH TRANSFORMERS

In [None]:
# Given the corpus above, how can we re-rank?
# https://huggingface.co/models?search=modernbert
# https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=trending&search=modernbert
# So here for this step we can possibly re-learn things or?



In [19]:
import pickle

with open('../../output/bm25-1st-step-evidence.pkl', 'rb') as file: 
      
    # Deserialize the BM25 results that were written with save_pickle.
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
    results = pickle.load(file) 

In [None]:
# Peek at one entry: a ((claim_id, question_id), question) tuple.
all_synthetic_questions[1]


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("tasksource/ModernBERT-base-embed")
# TODO / ideas for this reranking step:
# - Try the tokenization steps from the other paper on numerical reasoning.
# - Maybe add scientific notation too, or at least some other normalization step.
# - Alternative: the nomics model; it is smaller, so reranking with it might be easier.
# - Maybe fine-tune the model on FinQA (and similar) using the tokenizer suggested in the other dataset.
# - It would be interesting to see how much the BM25 scores correlate with these similarities.
# - Need a workaround that does not consume much CUDA memory, so everything can be rerun without running out of memory.


# Load the tokenizer and model


# # Encode a single sentence
# def encode(text):
#     tokens = tokenizer(text, return_tensors="pt", padding=True)
#     with torch.no_grad():
#         output = model(**tokens)
#         embeddings = output.last_hidden_state.mean(dim=1)  # Mean pooling
#     return embeddings

# Similarity of the first synthetic question to each of its BM25 candidates.
# all_synthetic_questions[0] is ((claim_id, question_id), question): encode the
# question text only, and only once, because it does not change per document.
# NOTE(review): `all_corpus` is not defined in the visible cells; presumably it
# is list(corpus.values()). Confirm before running.
query = model.encode(all_synthetic_questions[0][1])
for doc_index in results[0][1]:  # results[0][1] holds the top BM25 document indices
    # Index one document at a time: a plain list cannot be indexed with an index array.
    evidence = all_corpus[doc_index]
    against = model.encode(evidence)
    print(model.similarity(query, against))


In [None]:
# Look the candidates up one by one: a plain Python list cannot be indexed with the index array directly.
[all_corpus[doc_index] for doc_index in results[0][1]]

In [65]:

# Compute similarity
import torch  # used below but not imported in any cell visible here

# NOTE(review): `encode` only appears as commented-out code in the visible cells
# and `all_corpus` is not defined there; both must exist before this cell runs.
# Shape the query as (1, dim) and every document as (dim,), so the stacked
# documents are (n_docs, dim) and the similarity yields one value per document.
# If encode returns (1, dim), as the commented-out helper would, stacking the raw
# outputs gives (n_docs, 1, dim) and dim=1 would be a size-1 axis.
query_embedding = encode(all_synthetic_questions[0][1]).reshape(1, -1)
doc_embeddings = torch.stack(
    [encode(all_corpus[doc_index]).reshape(-1) for doc_index in results[0][1]]
)
cosine_similarities = torch.nn.functional.cosine_similarity(query_embedding, doc_embeddings, dim=1)

In [None]:
# Display the similarities computed in the cell above.
cosine_similarities

In [None]:
# Tokenize all inputs and get lengths
# NOTE(review): `dataset` is not defined in the visible cells. `tokenizer` is
# subscripted with 'input_ids' here, while preprocess_corpus/preprocess_queries
# call tokenizer(doc, id); presumably this cell expects a different
# (Hugging Face style) tokenizer. Confirm.
input_lengths = [len(tokenizer(text)['input_ids']) for text in dataset]

# Analyze distribution
import numpy as np
# max() raises ValueError if input_lengths is empty.
print(f"Max input length: {max(input_lengths)}")
print(f"90th percentile input length: {np.percentile(input_lengths, 90)}")