In [13]:
import os
import re
import numpy as np
from collections import defaultdict, Counter
from math import log

# Preprocessing function
def preprocess(text):
    """Split text into lowercase word tokens.

    The text is lowercased first; every run of word characters
    (letters, digits, underscore) delimited by word boundaries then
    becomes one token. Punctuation and whitespace are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Load documents
def load_documents(folder_path):
    """Read every ``.txt`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A dict mapping each file name (not the full path) to the file's
        contents decoded as UTF-8. Entries are inserted in sorted
        file-name order.
    """
    docs = {}
    # os.listdir() returns names in arbitrary order. Sorting makes the
    # dict order, and any tie-breaking that later depends on it,
    # reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(folder_path, filename)
        # A directory that happens to be named "*.txt" is not a document.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as file:
            docs[filename] = file.read()
    return docs

# Predefined queries
# Maps query id -> query text; ids are consecutive integers starting at 1.
predefined_queries = dict(enumerate(
    [
        "vegetarian recipes with chickpeas and spinach",
        "Chicken Gravy",
        "Mutton Curry",
        "Pizza",
        "high-protein smoothie recipes",
    ],
    start=1,
))

# Load relevant documents (ground truth for each query)
def load_relevances(relevance_file_path):
    """Load ground-truth relevance judgments from a text file.

    Every non-blank line must contain exactly two whitespace-separated
    fields: an integer query id followed by a document id. Blank lines
    (for example a trailing empty line) are ignored.

    Args:
        relevance_file_path: Path to the UTF-8 encoded judgments file.

    Returns:
        A ``defaultdict(set)`` mapping each integer query id to the set
        of document ids listed for it.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields
            or its query id is not an integer.
    """
    relevances = defaultdict(set)
    with open(relevance_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank line: carries no judgment, so skip it instead of
                # failing on tuple unpacking.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{relevance_file_path}:{line_number}: expected "
                    f"'<query_id> <doc_id>', got {line.strip()!r}"
                )
            query_id, doc_id = fields
            relevances[int(query_id)].add(doc_id)
    return relevances

# Compute term frequencies and document lengths
def compute_tf_and_lengths(documents):
    """Build per-document term counts and token counts.

    Args:
        documents: Mapping of document id -> raw text.

    Returns:
        A ``(tfs, doc_lengths)`` tuple, where ``tfs`` maps each document
        id to a ``Counter`` of its tokens and ``doc_lengths`` maps each
        document id to its total number of tokens.
    """
    # Tokenize each document once and derive both results from the tokens.
    token_lists = {doc_id: preprocess(text) for doc_id, text in documents.items()}
    tfs = {doc_id: Counter(tokens) for doc_id, tokens in token_lists.items()}
    doc_lengths = {doc_id: len(tokens) for doc_id, tokens in token_lists.items()}
    return tfs, doc_lengths

# Compute Inverse Document Frequencies (IDF)
def compute_idf(documents):
    """Compute a smoothed inverse document frequency for every term.

    For a term appearing in ``n`` of the ``N`` documents the weight is
    ``log((N - n + 0.5) / (n + 0.5) + 1)``.

    Args:
        documents: Mapping of document id -> raw text.

    Returns:
        A dict mapping each term found in the collection to its IDF.
    """
    total_docs = len(documents)
    doc_freq = Counter()
    for text in documents.values():
        # A set, so each document contributes at most 1 per term.
        doc_freq.update(set(preprocess(text)))
    return {
        term: log((total_docs - count + 0.5) / (count + 0.5) + 1)
        for term, count in doc_freq.items()
    }

# Compute BM25 scores
def bm25_score(query, tfs, idf, doc_lengths, avg_doc_length, k1=1.5, b=0.75):
    """Score documents against a query with BM25.

    Args:
        query: Raw query string; it is tokenized with ``preprocess``.
        tfs: Mapping of document id -> term-count mapping.
        idf: Mapping of term -> inverse document frequency.
        doc_lengths: Mapping of document id -> token count.
        avg_doc_length: Mean token count over the collection.
        k1: Term-frequency saturation parameter.
        b: Document-length normalization parameter.

    Returns:
        A ``defaultdict(float)`` mapping document id -> accumulated
        score. Only documents containing at least one query term that
        is present in ``idf`` receive an entry.
    """
    scores = defaultdict(float)
    # Query terms with no IDF entry contribute nothing.
    known_terms = (term for term in preprocess(query) if term in idf)
    for term in known_terms:
        term_idf = idf[term]
        for doc_id, doc_tf in tfs.items():
            count = doc_tf.get(term, 0)
            if count != 0:
                length_ratio = doc_lengths[doc_id] / avg_doc_length
                top = term_idf * count * (k1 + 1)
                bottom = count + k1 * (1 - b + b * length_ratio)
                scores[doc_id] += top / bottom
    return scores

# Calculate Average Precision (AP)
def calculate_average_precision(relevant_docs, ranked_docs):
    """Compute Average Precision (AP) for one ranked result list.

    Args:
        relevant_docs: Iterable of relevant document ids.
        ranked_docs: Sequence of ``(doc_id, score)`` pairs, best first.

    Returns:
        The sum of precision values taken at each rank where a relevant
        document appears, divided by the total number of relevant
        documents. Returns 0.0 when there are no relevant documents.
    """
    relevant = set(relevant_docs)
    if len(relevant) == 0:
        return 0.0

    # Precision at each rank (1-based) that holds a relevant document.
    precisions = []
    for rank, (doc_id, _score) in enumerate(ranked_docs, start=1):
        if doc_id in relevant:
            hits_so_far = len(precisions) + 1
            precisions.append(hits_so_far / rank)
    return sum(precisions, 0.0) / len(relevant)

# Rank documents with BM25 and calculate AP, saving results to 'result.txt'
def rank_documents_bm25(data_dir, output_path='result.txt', queries=None):
    """Rank documents with BM25 for each query and write an evaluation report.

    Documents are read from ``<data_dir>/documents`` and relevance
    judgments from ``<data_dir>/relevance.txt``. For every query the
    report lists the query, its Average Precision and the five
    highest-scoring documents; the Mean Average Precision over all
    queries is written last. A confirmation message is printed when done.

    Args:
        data_dir: Directory holding the ``documents`` folder and the
            ``relevance.txt`` file.
        output_path: File the report is written to (overwritten).
        queries: Mapping of query id -> query text. Defaults to the
            module-level ``predefined_queries``.

    Returns:
        None.
    """
    if queries is None:
        queries = predefined_queries

    # Load data
    documents = load_documents(os.path.join(data_dir, 'documents'))
    relevances = load_relevances(os.path.join(data_dir, 'relevance.txt'))

    # Preprocess and compute term frequencies and document lengths
    tfs, doc_lengths = compute_tf_and_lengths(documents)
    # Guard the empty collection: dividing by len(documents) would raise
    # ZeroDivisionError. With no documents nothing gets scored, so the
    # placeholder value is never used in a division.
    if documents:
        avg_doc_length = sum(doc_lengths.values()) / len(documents)
    else:
        avg_doc_length = 0.0

    # Compute IDF for the document collection
    idf = compute_idf(documents)

    total_ap = 0.0

    with open(output_path, 'w', encoding='utf-8') as result_file:
        for query_id, query in queries.items():
            scores = bm25_score(query, tfs, idf, doc_lengths, avg_doc_length)

            # Highest score first. Ties are broken by document id so the
            # ranking (and therefore AP) does not depend on dict order.
            ranked_docs = sorted(scores.items(), key=lambda item: (-item[1], item[0]))

            # Queries without judgments get an empty relevant set (AP 0).
            ap = calculate_average_precision(relevances.get(query_id, []), ranked_docs)
            total_ap += ap

            result_file.write(f"Query {query_id}: {query}\n")
            result_file.write(f"Average Precision: {ap:.4f}\n")
            result_file.write("Top 5 Documents:\n")
            for doc, score in ranked_docs[:5]:
                result_file.write(f"Document: {doc}, Score: {score:.4f}\n")
            result_file.write("\n")

        # Mean Average Precision (MAP) across all queries; 0 when there
        # are no queries to average over.
        mean_ap = total_ap / len(queries) if queries else 0.0
        result_file.write(f"Mean Average Precision (MAP): {mean_ap:.4f}\n")

    print(f"Results have been saved to '{output_path}'.")

# Path to the folder containing documents and relevance file
folder_path = '/content/Recipie ghar'

# Run the evaluation only when executed directly (script or notebook
# cell), not when this module is imported for its functions.
if __name__ == "__main__":
    rank_documents_bm25(folder_path)


Results have been saved to 'result.txt'.
