### BASELINE-1 RAG [BM25 Algorithm]

Imports

In [None]:
import os
import json
import re
from collections import Counter
import math

Loading the files

In [131]:
data_path="all_pdfs"
pdfs = {}
# Walk the directory tree and load every .txt file, keyed by its full path.
for folder, _, names in os.walk(data_path):
    for name in names:
        if not name.endswith('.txt'):
            continue
        txt_path = os.path.join(folder, name)
        with open(txt_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
        # Drop blank lines first, then squeeze every whitespace run
        # (including the remaining newlines) down to a single space.
        without_blank_lines = re.sub(r'\n\s*\n', '\n', raw_text)
        pdfs[txt_path] = re.sub(r'\s+', ' ', without_blank_lines)

Indexing

In [None]:
# Inverted index: term -> {document path -> occurrences of the term in it}.
index = {}
# Document path -> number of tokens in that document.
per_word = {}
# Number of tokens across the whole corpus.
total_length = 0
for path_file, pdf_text_val in pdfs.items():

    # Tokenise case-insensitively into runs of word characters.
    words = re.findall(r'\w+', pdf_text_val.lower())
    total_length += len(words)
    per_word[path_file] = len(words)

    # Record the per-document term frequencies.
    for word, freq in Counter(words).items():
        index.setdefault(word, {})[path_file] = freq

# Average document length in tokens. An empty corpus (no .txt files found)
# would otherwise raise ZeroDivisionError here; fall back to 0.0 instead.
# The index is empty in that case, so no scoring ever divides by this value.
avg_val = total_length / len(pdfs) if pdfs else 0.0

BM25 Algorithm (Best Match)

In [None]:
def bm25(term, path_file, index, per_word, avg_val,k1 = 1.5, b = 0.75):
    """Return the BM25 score of a single term for a single document.

    Args:
        term: Query term; must be a key of ``index``.
        path_file: Document identifier; must be a key of ``index[term]``
            and of ``per_word``.
        index: Mapping ``term -> {document -> term frequency}``.
        per_word: Mapping ``document -> document length in tokens``.
        avg_val: Average document length in tokens (must be non-zero).
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalisation (0 disables it).

    Returns:
        The product of the inverse document frequency and the saturated,
        length-normalised term frequency.

    Raises:
        KeyError: If ``term`` or ``path_file`` is missing from the mappings.
    """
    postings = index[term]
    tf = postings[path_file]
    length_ratio = per_word[path_file] / avg_val

    # IDF with "+ 1" inside the logarithm, so the value is positive whenever
    # the term's document count does not exceed the corpus size.
    doc_count = len(per_word)
    doc_freq = len(postings)
    idf = math.log((doc_count - doc_freq + 0.5) / (doc_freq + 0.5) + 1)

    saturation = tf + k1 * (1 - b + b * length_ratio)
    return idf * (tf * (k1 + 1)) / saturation

Extract the best-matching passage from a document's text

In [None]:
def get_text(contnt, query_terms, window_size = 100):
    """Return the ``window_size``-word passage containing the most query hits.

    The text is split on whitespace. A word counts as a hit when any query
    term occurs in its lower-cased form as a substring (so ``"cat"`` also
    hits ``"Cats,"``). Every window of ``window_size`` consecutive words is
    considered, including the one ending on the last word, and the earliest
    window with the highest hit count wins.

    Args:
        contnt: Document text.
        query_terms: Iterable of lower-cased query terms.
        window_size: Number of words in the returned passage.

    Returns:
        The selected words joined by single spaces. The whole text is
        returned when it has at most ``window_size`` words, and an empty
        string when the text is empty or ``window_size`` is not positive.
    """
    if window_size <= 0:
        return ''

    words_from_pdf = contnt.split()
    total_words = len(words_from_pdf)

    # 1 for every word that contains at least one query term, else 0.
    hits = [
        1 if any(term in word.lower() for term in query_terms) else 0
        for word in words_from_pdf
    ]

    # Last valid window start; 0 when the text is shorter than one window.
    last_start = max(0, total_words - window_size)

    # Slide the window one word at a time, updating the hit count
    # incrementally instead of recounting every window from scratch.
    current = sum(hits[:window_size])
    best_start = 0
    best_count = current
    for start in range(1, last_start + 1):
        current += hits[start + window_size - 1] - hits[start - 1]
        # Strict comparison keeps the earliest window on ties.
        if current > best_count:
            best_count = current
            best_start = start

    return ' '.join(words_from_pdf[best_start:best_start + window_size])

In [None]:
def finding_matched_text(query, pdfs, index, per_word, avg_val, top_k = 3):
    """Rank documents against ``query`` with BM25 and extract a passage from each.

    Args:
        query: Free-text query; it is lower-cased and split into ``\\w+`` tokens.
        pdfs: Mapping ``document path -> document text``.
        index: Mapping ``term -> {document path -> term frequency}``.
        per_word: Mapping ``document path -> document length in tokens``.
        avg_val: Average document length in tokens.
        top_k: Maximum number of documents to return.

    Returns:
        A list of ``(document path, score, passage)`` tuples ordered by
        descending score. Only documents with a positive score are included,
        and the list is empty when no query term occurs in the index.
    """
    query_terms = re.findall(r'\w+', query.lower())

    # Sum the per-term BM25 contributions of every document containing a term.
    doc_scores = {}
    for term in query_terms:
        if term not in index:
            continue
        for path_file in index[term]:
            contribution = bm25(term, path_file, index, per_word, avg_val)
            doc_scores[path_file] = doc_scores.get(path_file, 0.0) + contribution

    ranked = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

    return [
        (path_file, score, get_text(pdfs[path_file], query_terms))
        for path_file, score in ranked[:top_k]
        if score > 0
    ]

Load the queries from JSON

In [None]:
# Read the queries as UTF-8 explicitly: without an encoding, open() falls back
# to the platform locale, which can mis-decode non-ASCII queries. The results
# file is written as UTF-8 as well.
with open('all_queries.json', 'r', encoding='utf-8') as f:
    queries = json.load(f)['queries']

# One entry per query, holding the file name and passage of each retrieved
# document; "results" stays an empty list when nothing matched.
results = []
for qry in queries:
    query = qry['query']
    docs_match_and_text = finding_matched_text(query, pdfs, index, per_word, avg_val)
    result_json = {
        "query": query,
        "results": [
            {
                "filename": os.path.basename(path_file),
                "match": match
            }
            # The score is only used for ranking and is not exported.
            for path_file, _score, match in docs_match_and_text
        ]
    }
    results.append(result_json)

Write the results to the JSON file

In [137]:
# Save every query with its retrieved passages as pretty-printed UTF-8 JSON;
# any existing file is overwritten.
payload = {"queries": results}
with open('rag_baseline1.json', 'w', encoding='utf-8') as out_file:
    json.dump(payload, out_file, indent=4, ensure_ascii=False)