# Retrievers | BM25

In [97]:
!pip install --upgrade --quiet  rank_bm25 pypdf langchain-community

In [98]:
import re
import math
import numpy as np
from collections import Counter, defaultdict
from langchain_community.retrievers import BM25Retriever
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document

## Loading Data

In [99]:
# Load the PDF found at the hard-coded path below.
# NOTE(review): this is an absolute path — adjust it for your environment.
loader = PyPDFLoader("/content/data/Understanding_Climate_Change.pdf")
docs = loader.load()
# Display how many items were loaded and the container type
# (the recorded output shows 33 items in a list).
len(docs), type(docs)

(33, list)

In [100]:
# Inspect the 'type' attribute stored on the first loaded item
# (the recorded output shows 'Document').
vars(docs[0])['type']

'Document'

## Basic Sparse Retriever | BM25

In [101]:
# Build the library-provided BM25 retriever over the loaded documents;
# no extra arguments are passed, so the library defaults apply.
retriever_bm25 = BM25Retriever.from_documents(docs)

In [102]:
# Query the retriever with a sample question and display how many documents
# came back (the recorded output shows 4).
results = retriever_bm25.invoke("What is climate change?")
len(results)

4

In [103]:
# Print a 1-based label and a short preview of every retrieved document.
for i, result in enumerate(results):
    print(f"Document {i + 1}")
    # Only the first 111 characters of the text are shown.
    print(result.page_content[:111], "...")
    # Blank line between results.
    print()

Document 1
Indigenous Knowledge and Leadership 
Role of Indigenous Knowledge 
Indigenous knowledge systems offer valuable  ...

Document 2
coordination across government agencies, stakeholder engagement, and regular monitoring 
and evaluation. 
Polic ...

Document 3
Coal is the most carbon-intensive fossil fuel, and its use for electricity generation is a major 
source of CO2 ...

Document 4
Community-based solutions leverage local knowledge and resources to address climate 
challenges. Examples inclu ...



## Self-Made BM25

Formula used for ranking documents in BM25   
```
Score(D,Q) = Σ IDF(qi) × (f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
```

Inverse Document Frequency :
```
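IDF(qi) = log((N - n(qi) + 0.5) / (n(qi) + 0.5) + 1)
```
Where:  
- N is total number of documents
- n(qi) is number of documents containing term qi
- The +0.5 terms provide smoothing
- The +1 inside the logarithm (the variant used in the implementation below) keeps the IDF positive even for terms that appear in more than half of the documents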

Term Frequency Component :  
```
(f(qi,D) × (k1 + 1)) / (f(qi,D) + k1 × (1 - b + b × |D|/avgdl))
```
Where:  
- `f(qi,D)` is frequency of term qi in document D
- `k1` is the term-frequency saturation parameter (typically 1.2); the higher it is, the more additional occurrences of a term keep increasing the score.  
- `|D|` is document length
- `avgdl` is average document length
- `b` is the length normalization parameter (typically 0.75); 0 disables length normalization and 1 applies it fully.

### Implementation :

In [104]:
# default params
k1 = 1.2
b = 0.75
total_docs = 0
doc_lengths = []
doc_freqs = defaultdict(int)
doc_vectors = []
avgdl = 0

In [None]:
def extract_documents(docs):
    """Return the text of every document as a list of strings.

    Args:
        docs: iterable of objects exposing a ``page_content`` attribute.

    Returns:
        A list with ``str(page_content)`` for each input item, in input order.
    """
    texts = []
    for document in docs:
        # str() guarantees a plain string even if page_content is another type.
        texts.append(str(document.page_content))
    return texts

In [105]:
def remove_special_characters(text):
    """Normalize ``text`` and split it into tokens.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is deleted (not replaced by a space, so
    "climate-change" becomes "climatechange"), and the result is split on
    whitespace.

    Args:
        text: the string to tokenize.

    Returns:
        A list of lower-case tokens; empty when nothing is left to split.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered).split()

In [106]:
def calculate_params(documents):
    """Rebuild the global BM25 statistics from ``documents``.

    The module-level ``total_docs``, ``doc_lengths``, ``doc_freqs``,
    ``doc_vectors`` and ``avgdl`` are replaced with fresh values on every
    call, so indexing the same corpus again (e.g. once per query) does not
    accumulate duplicate entries or inflate document frequencies.

    Args:
        documents: sequence of raw document strings.

    Returns:
        None. Results are published through the module-level globals. For an
        empty corpus the statistics are left in their empty state.
    """
    global total_docs, doc_lengths, doc_freqs, doc_vectors, avgdl

    # Start from a clean slate: the previous index must not leak into this one.
    total_docs = len(documents)
    doc_lengths = []
    doc_freqs = defaultdict(int)
    doc_vectors = []
    avgdl = 0

    if total_docs == 0:
        return None

    # Calculate term and document frequencies and lengths
    for doc in documents:
        tokens = remove_special_characters(doc)
        doc_lengths.append(len(tokens))

        # Term frequencies within this document
        term_freqs = Counter(tokens)
        doc_vectors.append(term_freqs)

        # Each distinct term counts once per document towards its document frequency
        for term in term_freqs:
            doc_freqs[term] += 1

    # Average document length, in tokens
    avgdl = sum(doc_lengths) / total_docs

In [107]:
def score(query, index):
    """Compute the BM25 score of one indexed document for a tokenized query.

    Reads the module-level statistics ``doc_lengths``, ``doc_vectors``,
    ``doc_freqs``, ``total_docs``, ``avgdl`` and the parameters ``k1``/``b``.

    Args:
        query: iterable of query tokens; a repeated token contributes once
            per occurrence.
        index: position of the document in ``doc_lengths`` / ``doc_vectors``.

    Returns:
        The summed per-term contribution as a float (0.0 when no query token
        is known to the index).
    """
    doc_len = doc_lengths[index]
    term_counts = doc_vectors[index]

    total = 0.0
    for term in query:
        # Terms never seen in any document cannot contribute.
        if term in doc_freqs:
            containing = doc_freqs[term]

            # IDF with +1.0 inside the log, so the value stays positive.
            idf = math.log((total_docs - containing + 0.5) /
                           (containing + 0.5) + 1.0)

            # Raw term frequency in this document (0 if absent).
            tf = term_counts.get(term, 0)

            # Saturating, length-normalized term-frequency component.
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * doc_len / avgdl)

            total += idf * numerator / denominator

    return total

In [108]:
def customBM25(query, topk=3):
    """Rank the notebook's ``docs`` against ``query`` and print the best matches.

    The query is tokenized with ``remove_special_characters``, the texts of
    the global ``docs`` are extracted, ``calculate_params`` is called on them
    (on every invocation), and each document is scored with ``score``.

    Args:
        query: the raw query string.
        topk: how many of the highest-scoring documents to print.

    Returns:
        None. The query tokens and the ranked results are printed.
    """
    query = remove_special_characters(query)
    documents = extract_documents(docs)
    calculate_params(documents)

    print("Tokens of query:")
    print(query)

    # (document index, score, text) for every indexed document.
    ranked = [(i, score(query, i), documents[i]) for i in range(total_docs)]
    # Highest score first; ties keep their original document order.
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    print("Results:")
    for doc_index, doc_score, text in ranked[:topk]:
        print(f"Document {doc_index + 1}")
        print(f"Score: {doc_score}")
        print(f"Content: {text[:200]}...")
        print()

In [109]:
# Run the hand-written BM25 on a sample question and print the five
# best-scoring documents.
customBM25("What are effects of Climate Change?", 5)

Tokens of query:
['what', 'are', 'effects', 'of', 'climate', 'change']
Results:
Document 3
Score: 4.019183721093634
Content: Ruminant animals, such as cows and sheep, produce methane during digestion. Manure 
management practices also contribute to methane and nitrous oxide emissions. Innovations in 
livestock feeding and w...

Document 20
Score: 2.8071396519240333
Content: Indigenous Knowledge and Leadership 
Role of Indigenous Knowledge 
Indigenous knowledge systems offer valuable insights into sustainable land and resource 
management. Indigenous practices, such as co...

Document 7
Score: 2.7955653691352964
Content: Countries implement various policies to meet their climate goals, including carbon pricing, 
renewable energy incentives, and emissions regulations. National strategies must align with 
global targets...

Document 13
Score: 0.5567474760298794
Content: large-scale climate solutions. PPPs are particularly effective in areas such as infrastructure 
development, renewable e