# Retrievers | BM25

In [34]:
!pip install --upgrade --quiet  rank_bm25 pypdf langchain-community

In [35]:
import re
import math
import numpy as np
from collections import Counter, defaultdict
from langchain_community.retrievers import BM25Retriever
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document

## Loading Data

In [36]:
loader = PyPDFLoader("/content/data/Understanding_Climate_Change.pdf")
docs = loader.load()
len(docs), type(docs)

(33, list)

In [37]:
vars(docs[0])['type']

'Document'

## Basic Sparse Retriever | BM25

In [38]:
retriever_bm25 = BM25Retriever.from_documents(docs)

In [39]:
results = retriever_bm25.invoke("What is climate change?")
len(results)

4

In [40]:
for i, result in enumerate(results):
    print(f"Document {i + 1}")
    print(result.page_content[:111], "...")
    print()

Document 1
Indigenous Knowledge and Leadership 
Role of Indigenous Knowledge 
Indigenous knowledge systems offer valuable  ...

Document 2
coordination across government agencies, stakeholder engagement, and regular monitoring 
and evaluation. 
Polic ...

Document 3
Coal is the most carbon-intensive fossil fuel, and its use for electricity generation is a major 
source of CO2 ...

Document 4
Community-based solutions leverage local knowledge and resources to address climate 
challenges. Examples inclu ...



## Self-Made BM25

Formula used for ranking documents in BM25   
```
Score(D,Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
```

Inverse Document Frequency :
```
IDF(qi) = log((N - n(qi) + 0.5) / (n(qi) + 0.5))
```
Where:  
- N is total number of documents
- n(qi) is number of documents containing term qi
- The +0.5 terms provide smoothing

Term Frequency Component :  
```
(f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
```
Where:  
- `f(qi,D)` is frequency of term qi in document D
- `k1` is a saturation parameter (typically 1.2), if high, more impact from higher freq.  
- `|D|` is document length
- `avgdl` is average document length
- `b` is length normalization parameter (typically 0.75), 0 means no & 1 means full length normalization.

### Implementation :

In [78]:
# default params
k1 = 1.2
b = 0.75
total_docs = 0
doc_lengths = []
doc_freqs = defaultdict(int)
doc_vectors = []
avgdl = 0

In [79]:
def extract_documents(docs):
    return [str(doc.page_content) for doc in docs]

In [80]:
def remove_special_characters(text):
    res = re.sub(r'[^\w\s]', '', text)
    return res.split()

In [81]:
def calculate_params(documents):
    global total_docs, doc_lengths, doc_freqs, doc_vectors, avgdl
    total_docs = len(documents)
    if total_docs == 0:
        return None

    # Calculate term and document frequencies and lengths
    for doc in documents:
        tokens = remove_special_characters(doc)
        doc_lengths.append(len(tokens))

        # Count Term frequencies
        term_freqs = Counter(tokens)
        doc_vectors.append(term_freqs)

        # Count document frequencies
        for term in term_freqs:
            doc_freqs[term] += 1

    # calculate average document length
    avgdl = sum(doc_lengths) / total_docs

In [82]:
def score(query, index):
    score = 0.0
    length = doc_lengths[index]
    doc_vector = doc_vectors[index]

    for token in query:
        if token not in doc_freqs:
            continue

        # IDF
        idf = math.log((total_docs - doc_freqs[token] + 0.5) /
                      (doc_freqs[token] + 0.5) + 1.0)

        # TF
        tf = doc_vector.get(token, 0)

        # Normalization
        numerator = tf * (k1 + 1)
        denominator = tf + k1 * (1 - b + b * length / avgdl)

        score += idf * numerator / denominator

    return score

In [95]:
def customBM25(query, topk=3):
    query = remove_special_characters(query)
    documents = extract_documents(docs)
    calculate_params(documents)

    print("Tokens of query:")
    print(query)

    scores = []
    for i in range(total_docs):
        doc_score = score(query, i)
        scores.append((i, doc_score, documents[i]))

    print("Results:")
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for i, dscore, doc in scores[:topk]:
        print(f"Document {i + 1}")
        print(f"Score: {dscore}")
        print(f"Content: {doc[:200]}...")
        print()

In [96]:
customBM25("What are effects of Climate Change?", 5)

Tokens of query:
['What', 'are', 'effects', 'of', 'Climate', 'Change']
Results:
Document 33
Score: -3.763090532663562
Content: Legacy and Responsibility 
Recognizing the responsibility to future generations is a fundamental aspect of climate action. 
This involves making decisions that protect the environment and ensure a sus...

Document 9
Score: -5.698526266430585
Content: Captured CO2 can be used to produce building materials, synthetic fuels, and other products. 
This process not only reduces emissions but also creates value from waste CO2. Research 
into carbon utili...

Document 16
Score: -6.181471943092975
Content: empower communities to take action. Building social cohesion and support networks 
enhances community resilience. 
Climate-Smart Healthcare 
Integrating climate considerations into healthcare planning...

Document 21
Score: -6.22725444940628
Content: Green Economy 
A green economy prioritizes sustainability and reduces environmental risks. It encompasses 
renewable en