In [3]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp313-cp313-macosx_14_0_arm64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m2.6 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.9.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.0.1-py3-none-any.whl.metadata (13 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-12.0.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (8.8 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.36.0-py

In [10]:
import os,time,psutil,numpy as np
import faiss
from sentence_transformers import SentenceTransformer

In [11]:
print("FAISS:",faiss.__version__)

FAISS: 1.12.0


In [33]:
#++++++++++++++++++Embedding the documents +++++++++++++++++

In [34]:
from pathlib import Path
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer

In [35]:
HTML_DIR = Path("/Users/devenderswami/GenAI/GenAI-NoteBooks/html_documents")

In [48]:
# Read Html Files -> Extract Plain Text
file_paths = sorted(HTML_DIR.glob("*.html"))
texts = []
doc_ids = [] #string IDs (e.g., file stems)
file_names = [] #optional : full file names

for fp in file_paths:
    html = fp.read_text(encoding="utf-8",errors="ignore")
    soup = BeautifulSoup(html,"html.parser")
    text = soup.get_text(" ",strip=True)
    if not text:
        continue
    texts.append(text)
    doc_ids.append(fp.stem)
    file_names.append(fp.name)

print("text",texts)
print("docid",doc_ids)
print("file_names",file_names)

text ['Credit Card Usage Policy Credit Card Billing and Payment Policy Customers must ensure timely payment of their credit card bills to avoid late payment fees and interest charges. The due date for payment will be mentioned on every monthly statement. Partial payments will attract interest on the remaining balance until full payment is made. Rewards and Cashback Each purchase made using the credit card earns reward points or cashback as per the ongoing offer. Reward points can be redeemed for vouchers, merchandise, or converted into statement credit. Security and Fraud Prevention Never share your OTP or CVV with anyone, including bank officials. Immediately report any unauthorized transaction to the bank’s helpline. Lost cards should be blocked instantly using the mobile app or helpline number. Contact For billing disputes or reward redemption queries, email support@creditbank.com or call 1800-900-999.', 'Personal Loan Policy Policy on Pre-Closing a Personal Loan Customers may pre-c

In [37]:
# Create embeddings (matches your previous MiniLM choice)
# texts :A list of strings (your cleaned HTML text).
# convert_to_numpy=True : Returns the output as a NumPy array instead of PyTorch tensors.
# normalize_embeddings=False:If True, embeddings are scaled to unit length (good for cosine). Here it’s left False because FAISS can handle normalization separately.
# .astype("float32") Converts data type from 64-bit floats to 32-bit — required by FAISS for speed and memory efficiency.

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(
    texts,convert_to_numpy=True,normalize_embeddings=False
).astype("float32")
emb = np.ascontiguousarray(emb)

In [38]:
#3 Shapes /basic sanity
N,d = emb.shape
print("Docs embedded:",N,"| Dim:",d)

Docs embedded: 2 | Dim: 384


In [39]:
#4 (Optional) Map your string IDs to int64 for FAISS IndexIDMap2
#This creates a unique numeric ID (int64) for each document string ID (like file name) — because FAISS needs numeric IDs, not text.
ids_int64 = np.array([int(hash(s)) & 0x7FFFFFFF for s in doc_ids], dtype="int64")
                      

In [40]:
# Now you have:
# emb: (N, d) float32 contiguous embeddings from your HTML docs
# doc_ids: list of string IDs (file stems)
# file_names: list of file names (for readability in results)
# ids_int64: deterministic int64 IDs ready for FAISS add_with_ids(...)

In [41]:
#++++++++++++++++++Upserting into FAISS-HNSW +++++++++++++++++

In [47]:
#use cosine via IP on normalized vectors
emb_norm = emb.copy()
faiss.normalize_L2(emb_norm) #in-place L2 normalization
M = 32 #graph degree (higher -> better recall more memory)
#if your faiss build supports metric param , prefer METRIC_INNER_PRODUCT:
#hnsw = faiss.IndexHNSWFlat(d,M,faiss.METRIC_INNER_PRODUCT)
#otherwise this will also works with normalize vectors
hnsw = faiss.IndexHNSWFlat(d,M)
#Turning knobs
hnsw.hnsw.efConstruction = 200  #build time accuracy/ speed trade-off
hnsw.hnsw.efSearch = 64 #query-time recall/speed trade-off

#Map your Customer IDs 
hnsw_idmap = faiss.IndexIDMap2(hnsw)
hnsw_idmap.add_with_ids(emb_norm,ids_int64)
print("HNSW total vectors",hnsw_idmap.ntotal)

HNSW total vectors 2


In [43]:
#++++++++++++++++++Querying into FAISS-HNSW +++++++++++++++++

In [53]:
query_text = "Prepayment is allowed only after how many EMI's"
## encode query in vector
q = model.encode([query_text],# wrap in a list for batch shape(1,d)
                  convert_to_numpy = True,
                  normalize_embeddings=False #keep false we will normalize manually
                ).astype("float32")
# normalize the vector
faiss.normalize_L2(q)
#run the search
top_k = 5
D,I = hnsw_idmap.search(q,top_k) # D: Similarity scores, I: int64 IDs
#Map IDs back to filenames and show result
id_to_file = {ids_int64[i]: file_names[i] for i in range(len(file_names))}
for rank, (pid,score) in enumerate(zip(I[0].tolist(),D[0].tolist()),start=1):
    print(f"{rank}. id={pid} score={round(score,4)} file={id_to_file.get(pid)}")
                                   


1. id=566911232 score=1.1461 file=sample_policy.html
2. id=452395549 score=1.7896 file=credit_card_policy.html
3. id=-1 score=3.4028234663852886e+38 file=None
4. id=-1 score=3.4028234663852886e+38 file=None
5. id=-1 score=3.4028234663852886e+38 file=None


In [54]:
#++++++++++++++ Upserting IVF ++++++++++++++

In [58]:
# Assume you already have:
# emb -> (N, d) float32 embeddings from your HTML docs
# ids_int64 -> np.int64 IDs matching docs
# d -> embedding dimension
# ---------------------------
# 1. Normalize (for cosine search)
# ---------------------------
emb_norm = emb.copy()
faiss.normalize_L2(emb_norm)
# ---------------------------
# 2. Create IVF index
# ---------------------------
#nlist = min(64,max(8,len(ids_int64)//5)) # number of coarse clusters
nlist = min(2, max(1, len(ids_int64)//2))
quantizer = faiss.IndexFlatIP(d)# quantizer (flat IP for cosine)
# IVF index with Flat storage
ivf = faiss.IndexIVFFlat(
quantizer,
d,
nlist,faiss.METRIC_INNER_PRODUCT)
# ---------------------------
# 3. Train the IVF index
# ---------------------------
# Training is required before adding vectors
ivf.train(emb_norm)
# ---------------------------
# 4. Wrap with IDMap and add vectors
# ---------------------------
ivf_idmap = faiss.IndexIDMap(ivf)
ivf_idmap.add_with_ids(emb_norm,ids_int64)
# ---------------------------
# 5. Set query-time parameter
# ---------------------------
ivf_idmap.nprobe = min(8,nlist) #number of cluster to search at query time
print("IVF total vectors:", ivf_idmap.ntotal)

IVF total vectors: 2




In [59]:
#++++++++++++++ Querying IVF ++++++++++++++

In [73]:
# =============================
# 1 -- Prepare the query text
# =============================
query_text = "AI in simulations"
# =============================
# 2 -- Encode query into a vector
# =============================
q = model.encode(
[query_text], # batch shape (1, d)
convert_to_numpy=True,
normalize_embeddings=False # we'll normalize manually
).astype("float32")
# =============================
# 3 -- Normalize the vector
# =============================
# For cosine similarity with IVF (IP), normalize query too
faiss.normalize_L2(q)
# =============================
# 4 -- (Optional) tune IVF query breadth
# =============================
# nprobe = how many coarse clusters to search; higher -> better recall,slower
# You likely set this when building the index; can tweak here if needed
nlist = min(2, max(1, len(ids_int64)//2))
#ivf_idmap.nprobe = min(8, ivf_idmap.nlist) # e.g., 8; try 16/32 for higher recall
#ivf_idmap.index.nprobe = min(2,nlist)
# =============================
# 5 -- Run the search
# =============================
top_k = 5
D, I = ivf_idmap.search(q, top_k) # D: similarity scores, I: int64 IDs
# =============================
# 6 -- Map IDs back to filenames and show results
# =============================
id_to_file = {ids_int64[i]: file_names[i] for i in range(len(file_names))}
for rank, (pid, score) in enumerate(zip(I[0].tolist(), D[0].tolist()),start=1):
  print(f"{rank}. id={pid} score={round(score,4)}file={id_to_file.get(pid)}")

1. id=566911232 score=0.0521file=sample_policy.html
2. id=452395549 score=0.0079file=credit_card_policy.html
3. id=-1 score=-3.4028234663852886e+38file=None
4. id=-1 score=-3.4028234663852886e+38file=None
5. id=-1 score=-3.4028234663852886e+38file=None
