In [6]:
# You may need to install these libraries first:
# pip install sentence-transformers scikit-learn numpy

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- 1. Sample Data & Query ---
# chunks = [
#     "Applicants must be at least 21 years old.", # Relevant
#     "Minimum monthly salary is ₹25,000.", # Relevant
#     "Applicants must have a credit score above 700.", # Relevant
#     "The age requirement is that applicants must be at least 21.", # Redundant
#     "A credit score of 700 or higher is mandatory.", # Redundant
#     "Personal loans are not available to self-employed applicants.", # Slightly less relevant
#     "EMI defaults attract a penalty of 2% per month.", # Irrelevant
# ]
# query = "What are the age, salary, and credit score requirements for a personal loan?"
# The demo corpus, built from four labelled groups so the role each chunk
# plays with respect to the query is visible by name. The groups are
# concatenated in a fixed order, so positions in `chunks` are stable.

# Chunks that directly answer the query.
relevant_chunks = [
    "Refunds are processed within 5 to 7 business days after product pickup.",
    "Customers can initiate a return request from the orders section in their profile.",
    "Refunds are credited to the original payment method once quality check is cleared.",
]

# Paraphrases of the relevant chunks (same facts, different wording).
redundant_chunks = [
    "After return approval, the money is refunded within one week to the same payment mode.",
    "Products can be returned through the user dashboard within seven working days.",
    "Refund will be sent to the same account that was used during purchase.",
]

# Slightly relevant / contextual information.
contextual_chunks = [
    "Replacement requests are handled faster than refunds, depending on stock availability.",
    "In case of wrong product delivery, both pickup and re-shipment are arranged by the company.",
    "Customers can contact support if the return tracking ID is not active after 48 hours.",
]

# Chunks unrelated to the query.
irrelevant_chunks = [
    "Office cafeteria will serve free lunch on Friday for the annual day celebration.",
    "The HR team announced a new work-from-home policy starting next quarter.",
]

chunks = [
    *relevant_chunks,
    *redundant_chunks,
    *contextual_chunks,
    *irrelevant_chunks,
]

query = "How does the refund and return process work for customers?"

# --- 2. Embedding ---
# The same model embeds both the corpus and the query so their vectors are
# comparable. The query is passed as a one-element list; the retrieval code
# below reads row 0 of the resulting similarity matrix.
model = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = model.encode(chunks)
query_embedding = model.encode([query])

In [7]:
# --- 3. Vanilla Top-k Retrieval ---
def vanilla_top_k(query_embedding, chunk_embeddings, chunks, k=3):
    """Return the k chunks most similar to the query, best first.

    Args:
        query_embedding: Query vector(s) as accepted by ``cosine_similarity``;
            only the first row of the resulting similarity matrix is used.
        chunk_embeddings: One embedding per entry of ``chunks``, same order.
        chunks: The chunk texts to rank.
        k: Maximum number of chunks to return. Values <= 0 yield no results;
            values larger than the corpus return every chunk.

    Returns:
        A ``(texts, scores)`` tuple: the selected chunk texts and a NumPy
        array with their cosine similarities to the query, both ordered by
        descending similarity.
    """
    # Nothing to rank: bail out before cosine_similarity sees empty input.
    if k <= 0 or len(chunks) == 0:
        return [], np.empty(0)

    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # argsort is ascending, so reverse first and then take the leading k.
    # (Slicing with [-k:] would return *everything* for k == 0.)
    top_k_idx = similarities.argsort()[::-1][:k]
    return [chunks[i] for i in top_k_idx], similarities[top_k_idx]

# Run plain similarity ranking and show each hit with its score.
vanilla_results, vanilla_scores = vanilla_top_k(
    query_embedding,
    chunk_embeddings,
    chunks,
    k=3,
)
print("--- Vanilla Top-3 Results (Often Redundant) ---")
for score, text in zip(vanilla_scores, vanilla_results):
    print(f"  - (Score: {score:.2f}) {text}")


# --- 4. MMR-based Retrieval ---
def mmr(query_embedding, chunk_embeddings, chunks, k=3, lambda_param=0.5):
    """Return up to k chunks chosen by Maximum Marginal Relevance (MMR).

    The first pick is the chunk most similar to the query. Every further
    pick maximizes

        lambda_param * sim(chunk, query)
            - (1 - lambda_param) * max(sim(chunk, already_selected))

    so a chunk is penalized for resembling something already chosen.

    Args:
        query_embedding: Query vector(s) as accepted by ``cosine_similarity``;
            only the first row of the resulting similarity matrix is used.
        chunk_embeddings: One embedding per entry of ``chunks``, same order.
        chunks: The chunk texts to select from.
        k: Maximum number of chunks to return. Values <= 0 yield an empty
            list; values larger than the corpus return every chunk.
        lambda_param: Weight of query relevance versus the redundancy
            penalty (1.0 = relevance only).

    Returns:
        The selected chunk texts, in selection order.
    """
    # Nothing to select: bail out before cosine_similarity sees empty input.
    if k <= 0 or len(chunks) == 0:
        return []

    relevance = cosine_similarity(query_embedding, chunk_embeddings)[0]
    n_select = min(k, len(chunks), len(relevance))

    selected_indices = [int(np.argmax(relevance))]
    is_selected = np.zeros(relevance.shape, dtype=bool)
    is_selected[selected_indices[0]] = True

    # For every chunk, the highest similarity to any chunk selected so far.
    # Updated incrementally: each round only compares against the newest
    # pick instead of re-comparing each candidate with the whole selection.
    max_sim_to_selected = np.full(relevance.shape, -np.inf, dtype=relevance.dtype)

    while len(selected_indices) < n_select:
        newest = selected_indices[-1]
        sim_to_newest = cosine_similarity(
            chunk_embeddings, chunk_embeddings[newest:newest + 1]
        )[:, 0]
        max_sim_to_selected = np.maximum(max_sim_to_selected, sim_to_newest)

        scores = lambda_param * relevance - (1 - lambda_param) * max_sim_to_selected
        # Already-selected chunks must never win again.
        scores[is_selected] = -np.inf

        # argmax returns the first maximum, i.e. the lowest index on ties.
        pick = int(np.argmax(scores))
        selected_indices.append(pick)
        is_selected[pick] = True

    return [chunks[i] for i in selected_indices]

# Run MMR with a relevance-leaning weight (0.7) and list the selection.
mmr_results = mmr(
    query_embedding,
    chunk_embeddings,
    chunks,
    k=3,
    lambda_param=0.7,
)
print("\n--- MMR Top-3 Results (Relevant and Diverse) ---")
for text in mmr_results:
    print(f"  - {text}")

--- Vanilla Top-3 Results (Often Redundant) ---
  - (Score: 0.66) Refund will be sent to the same account that was used during purchase.
  - (Score: 0.64) Refunds are credited to the original payment method once quality check is cleared.
  - (Score: 0.61) Customers can initiate a return request from the orders section in their profile.

--- MMR Top-3 Results (Relevant and Diverse) ---
  - Refund will be sent to the same account that was used during purchase.
  - Customers can initiate a return request from the orders section in their profile.
  - Replacement requests are handled faster than refunds, depending on stock availability.
