In [4]:
import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

In [43]:
documents = [
    "The new smartphone features an advanced AI camera for low-light photography.",
    "Healthy eating includes fruits, vegetables, and whole grains.",
    "Machine learning algorithms can detect fraudulent transactions in banking.",
    "Regular exercise improves mental health and boosts energy levels."

]

In [44]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [45]:
# --- Step 2: Dense embeddings (FAISS index) ---
embeddings = model.encode(documents).astype('float32')
faiss.normalize_L2(embeddings)
index = faiss.IndexHNSWFlat(embeddings.shape[1], 16, faiss.METRIC_INNER_PRODUCT)
index.add(embeddings)

In [46]:
# --- Step 3: Sparse BM25 index (automatic scoring) ---
tokenized_corpus = [doc.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_corpus) #This creates a BM25 search index from your tokenized documents.

In [47]:
print(tokenized_corpus)

[['the', 'new', 'smartphone', 'features', 'an', 'advanced', 'ai', 'camera', 'for', 'low-light', 'photography.'], ['healthy', 'eating', 'includes', 'fruits,', 'vegetables,', 'and', 'whole', 'grains.'], ['machine', 'learning', 'algorithms', 'can', 'detect', 'fraudulent', 'transactions', 'in', 'banking.'], ['regular', 'exercise', 'improves', 'mental', 'health', 'and', 'boosts', 'energy', 'levels.']]


In [48]:
print(bm25)

<rank_bm25.BM25Okapi object at 0x13e7ba780>


In [49]:
# --- Step 4: Query and retrieval ---
query = "How does AI help in detecting fraud in banks?"
query_vector = model.encode([query]).astype('float32')
faiss.normalize_L2(query_vector)

In [50]:
# Dense retrieval
dense_k = 3
distances, indices = index.search(query_vector, dense_k)
dense_candidates = indices[0].tolist() #converts the NumPy array ‚Üí Python list:

In [51]:
print(indices)

[[2 0 3]]


In [52]:
print(distances)

[[ 0.72809994  0.10630827 -0.0809403 ]]


In [53]:
print(dense_candidates)

[2, 0, 3]


In [54]:
# BM25 rerank automatically
bm25_scores = bm25.get_scores(query.lower().split())

In [55]:
print(bm25_scores)

[0.78082244 0.         1.71545942 0.        ]


In [62]:
# Combine (simple hybrid weighting)
# semantic (dense) scores from FAISS and the keyword (sparse) scores from BM25 into one unified ranking.
alpha = 1  # 0 = pure dense, 1 = pure BM25 alpha controls how much weight you give to BM25 (keyword match) vs FAISS (semantic match).
hybrid_scores = [] #store a tuple (doc_index, combined_score) for each document here.
for i, doc in enumerate(documents):
    dense_score = (1 - alpha) * (1.0 if i in dense_candidates else 0.0)
    hybrid_score = alpha * bm25_scores[i] + dense_score
    hybrid_scores.append((i, hybrid_score))

In [63]:
# --- Step 5: Sort and display ---
#Sorts the list of tuples
#Sorts based on the second value of each tuple ‚Üí the score
#Sorts from highest to lowest (since higher score = more relevant)
#Takes only the top 2 results
final_results = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:2]

print(f"\nüîç Query: '{query}'")
print("\nTop Results:")
for rank, (idx, score) in enumerate(final_results, start=1):
    print(f"{rank}. [{score:.4f}] {documents[idx]}")


üîç Query: 'How does AI help in detecting fraud in banks?'

Top Results:
1. [1.7155] Machine learning algorithms can detect fraudulent transactions in banking.
2. [0.7808] The new smartphone features an advanced AI camera for low-light photography.


In [68]:
import os

current_path = os.getcwd()
print(current_path)

/Users/devenderswami
