# TF-IDF and BM25

In [1]:
from collections import Counter
from math import log
import re

In [2]:
# Heuristic tokenizer for demonstration purposes
def tokenize(text: str):
    """Turn *text* into a list of lowercase word tokens without stopwords.

    The text is lowercased, every character that is neither a word
    character nor whitespace is replaced by a space (so punctuation acts
    as a separator), and the result is split on whitespace. Tokens found
    in the small built-in English stopword set are dropped.
    """
    ignored = {
        "the", "a", "an", "and", "or", "of",
        "in", "on", "for", "to", "is", "are",
    }
    cleaned = re.sub(r"[^\w\s]", " ", text.lower())
    return [word for word in cleaned.split() if word not in ignored]

In [3]:
# Index statistics
class CorpusStats:
    """Collection-level statistics computed once from a list of documents.

    Attributes set by ``__init__``:
        N: number of documents.
        document_tokens: one token list per document, produced by ``tokenize``.
        document_length: number of tokens in each document.
        avgdl: mean number of tokens per document (0 for an empty collection).
        df: Counter mapping each token to the number of documents containing it.
    """

    def __init__(self, documents):
        self.N = len(documents)
        self.document_tokens = list(map(tokenize, documents))
        self.document_length = list(map(len, self.document_tokens))

        # max(1, N) keeps the average defined when there are no documents.
        token_total = sum(self.document_length)
        self.avgdl = token_total / max(1, self.N)

        # Document frequency: each document contributes at most 1 per token,
        # hence the set() around its tokens.
        self.df = Counter()
        for tokens in self.document_tokens:
            self.df.update(set(tokens))

In [4]:
# TF-IDF weights and scoring process
def idf_smooth(term, stats: CorpusStats):
    """Return the smoothed inverse document frequency of *term*.

    The value is log((N + 1) / (df + 1)), where N is ``stats.N`` and df is
    the term's entry in ``stats.df``; a term missing from ``stats.df`` is
    treated as df = 0.
    """
    doc_freq = stats.df.get(term, 0)
    numerator = stats.N + 1
    return log(numerator / (doc_freq + 1))


def tfidf_vector(tokens, stats: CorpusStats):
    """Build a sparse TF-IDF vector (a dict keyed by token) for *tokens*.

    Each distinct token is weighted by (1 + log(count)) * idf_smooth(token),
    where count is how many times the token occurs in *tokens*.
    """
    counts = Counter(tokens)
    return {
        term: (1 + log(count)) * idf_smooth(term, stats)
        for term, count in counts.items()
    }


# sparse dot product
def dot(a: dict, b: dict):
    """Return the dot product of two sparse vectors stored as dicts.

    Keys missing from either vector count as 0.0. The shorter dict is the
    one iterated, with lookups going into the longer one.
    """
    small, large = (b, a) if len(a) > len(b) else (a, b)
    return sum(weight * large.get(term, 0.0) for term, weight in small.items())


def tfidf_score(query: str, document_index: int, stats: CorpusStats):
    """Score one indexed document against *query* using TF-IDF.

    The query is tokenized, both the query and the document at
    ``stats.document_tokens[document_index]`` are converted to TF-IDF
    vectors with the same corpus statistics, and the score is the dot
    product of the two vectors (no length normalization is applied).
    """
    query_tokens = tokenize(query)
    query_weights = tfidf_vector(query_tokens, stats)
    document_tokens = stats.document_tokens[document_index]
    return dot(query_weights, tfidf_vector(document_tokens, stats))

In [5]:
# BM25 score
def bm25_score(query: str, doc_idx: int, stats: CorpusStats, k1=1.5, b=0.7):
    """Return the BM25 score of the document at *doc_idx* for *query*.

    Args:
        query: raw query text; it is tokenized with ``tokenize``.
        doc_idx: index of the document in ``stats.document_tokens``.
        stats: corpus statistics supplying N, df, avgdl and document lengths.
        k1: weight of the length-normalized term in the denominator.
        b: mixes a constant (1 - b) with the relative document length
            (dl / avgdl) inside that term.

    Query terms that do not occur in the document contribute nothing; a
    term repeated in the query is added once per occurrence.
    """
    query_terms = tokenize(query)
    term_counts = Counter(stats.document_tokens[doc_idx])
    doc_len = stats.document_length[doc_idx]

    # The length-normalization part of the denominator is the same for every
    # term; max(1e-9, avgdl) avoids dividing by zero on an empty corpus.
    length_norm = k1 * (1 - b + b * doc_len / max(1e-9, stats.avgdl))

    total = 0.0
    for term in query_terms:
        count = term_counts[term]
        if count:
            doc_freq = stats.df.get(term, 0)
            # NOTE: this IDF goes negative once a term appears in more than
            # half of the documents.
            idf = log((stats.N - doc_freq + 0.5) / (doc_freq + 0.5))
            total += idf * (count * (k1 + 1)) / (count + length_norm)
    return total

In [6]:
# Demo scoring
# Demo corpus: 51 short English title-like strings on mixed topics.
# Each string is one document; its list position is the document index
# passed to the scoring functions.
docs = [
    "Guide to VAT reverse charge in Poland for B2B services",
    "Solar energy incentives and photovoltaic power regulations",
    "Understanding reverse charge mechanism for EU VAT rules",
    "Troubleshooting error code E1234 on GPU drivers",
    "Comprehensive overview of GDPR compliance for small businesses",
    "Beginner’s guide to neural networks and deep learning concepts",
    "Managing supply chain risk in global manufacturing industries",
    "Effective marketing automation strategies for e-commerce brands",
    "Impact of climate change on coastal erosion and flood defenses",
    "Setting up Kubernetes clusters on AWS for microservices",
    "Understanding ISO 27001 information security certification",
    "Optimizing SQL queries for better database performance",
    "Corporate income tax obligations for foreign subsidiaries",
    "Developing blockchain smart contracts using Solidity",
    "European Green Deal targets and sustainability reporting",
    "Best practices for remote team collaboration and productivity",
    "Machine learning approaches for credit risk assessment",
    "Introduction to quantum computing and Qiskit basics",
    "Employee data protection under EU labor regulations",
    "Financial forecasting with Python and time series models",
    "Troubleshooting slow website performance and SEO issues",
    "Renewable energy project financing and investment options",
    "Writing unit tests in JavaScript with Jest framework",
    "Understanding inflation trends in post-pandemic economies",
    "Guide to containerization and Docker image optimization",
    "Comparing renewable and fossil energy efficiency ratios",
    "Compliance checklist for import/export customs documentation",
    "Deploying machine learning models using Flask and FastAPI",
    "Healthcare data interoperability under HL7 and FHIR standards",
    "Investing in ETFs versus individual stocks: pros and cons",
    "AI-powered fraud detection in fintech applications",
    "Principles of agile software development and scrum sprints",
    "European patent application process and documentation",
    "Managing cloud costs and resource allocation in Azure",
    "Psychological impact of remote learning on students",
    "Understanding carbon offset programs and their limitations",
    "Best practices for cybersecurity incident response plans",
    "Ethical implications of AI-driven decision making",
    "Overview of the US SEC reporting requirements for IPOs",
    "Introduction to RESTful API design and HTTP methods",
    "Electric vehicle charging infrastructure regulations",
    "Predictive maintenance in industrial IoT environments",
    "Effective data visualization techniques using Tableau",
    "Corporate sustainability KPIs and ESG performance metrics",
    "Understanding fiscal policy and central bank interventions",
    "Cross-border e-commerce tax and customs considerations",
    "Deep reinforcement learning for autonomous systems",
    "User experience design principles for mobile apps",
    "Biodegradable packaging innovations in food industry",
    "Cloud data backup and disaster recovery best practices",
    "Introduction to digital twins and industrial simulation models",
]


In [7]:
# Tokenize the demo corpus once and precompute the statistics the scorers
# read: N, per-document token lists and lengths, avgdl, and df.
stats = CorpusStats(docs)

In [8]:
# Score every document against one query with both rankers and print the
# two scores side by side, followed by the document text.
query = "VAT reverse charge Poland"
for i, d in enumerate(docs):
    tfidf_value = tfidf_score(query, i, stats)
    bm25_value = bm25_score(query, i, stats)
    print(f"Doc {i} TF-IDF = {tfidf_value:.4f} | BM25 = {bm25_value:.4f} :: {d}")

Doc 0 TF-IDF = 35.0277 | BM25 = 11.7782 :: Guide to VAT reverse charge in Poland for B2B services
Doc 1 TF-IDF = 0.0000 | BM25 = 0.0000 :: Solar energy incentives and photovoltaic power regulations
Doc 2 TF-IDF = 24.4125 | BM25 = 8.4577 :: Understanding reverse charge mechanism for EU VAT rules
Doc 3 TF-IDF = 0.0000 | BM25 = 0.0000 :: Troubleshooting error code E1234 on GPU drivers
Doc 4 TF-IDF = 0.0000 | BM25 = 0.0000 :: Comprehensive overview of GDPR compliance for small businesses
Doc 5 TF-IDF = 0.0000 | BM25 = 0.0000 :: Beginner’s guide to neural networks and deep learning concepts
Doc 6 TF-IDF = 0.0000 | BM25 = 0.0000 :: Managing supply chain risk in global manufacturing industries
Doc 7 TF-IDF = 0.0000 | BM25 = 0.0000 :: Effective marketing automation strategies for e-commerce brands
Doc 8 TF-IDF = 0.0000 | BM25 = 0.0000 :: Impact of climate change on coastal erosion and flood defenses
Doc 9 TF-IDF = 0.0000 | BM25 = 0.0000 :: Setting up Kubernetes clusters on AWS for microservice