In [1]:
import os
from pathlib import Path
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import time

In [5]:
# --- Config ---
API_KEY_FILE = '/Users/devenderswami/GenAI/GenAI-NoteBooks/api_key.txt'
HTML_DIR = Path(r"/Users/devenderswami/GenAI/GenAI-NoteBooks/html_documents")
INDEX_NAME = "my-headings-index"

In [8]:
# Read API key
with open(API_KEY_FILE, 'r') as f:
     api_key = f.read().strip()
# Init Pinecone
pc = Pinecone(api_key=api_key)

In [10]:
# Create index if not exists
if INDEX_NAME not in [i.name for i in pc.list_indexes()]:
        pc.create_index(
        name=INDEX_NAME,
        dimension=384, # matches MiniLM output
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )

In [11]:
# Connect to index
index = pc.Index(INDEX_NAME)

In [12]:
# Load embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [14]:
# Loop through HTML files
for file_path in HTML_DIR.glob("*.html"):
    with open(file_path, "r", encoding="utf-8") as f:
       soup = BeautifulSoup(f.read(), "html.parser")

In [15]:
# Extract all headings (h1-h6)
headings = [h.get_text(strip=True) for h in soup.find_all(['h1','h2','h3','h4','h5','h6'])]
headings_text = " | ".join(headings) if headings else "No headings"

In [16]:
keywords = list({word.lower() for heading in headings for word in heading.split()})
# Create embedding for full document text (or you can use headings only)
text_for_embedding = " ".join(headings) if headings else soup.get_text(strip=True)
embedding = model.encode(text_for_embedding).tolist()

In [17]:
# Metadata
metadata = {
"file_name": file_path.name,
"headings": headings_text,
"created_at": int(time.time()),
"heading_keywords": keywords
}
# Upsert into Pinecone
index.upsert([
(file_path.stem, embedding, metadata)
])
print("Data inserted into Pinecone with headings as metadata.")

Data inserted into Pinecone with headings as metadata.


In [18]:
#+++++++++++++++++++++++++++++++++++++++ Filtering with MetaData +++++++++++++++++++++++++++++++++++++++

In [22]:
query = "Billing and Payment Policy"
query_embedding = model.encode(query).tolist()
search_results = index.query(
    vector=query_embedding,
    top_k=5,
    include_values=False,
    include_metadata=True,
    filter={
        "heading_keywords":{"$in":["billing"]}
    }
)
print("Search Results:")
for match in search_results["matches"]:
    print(f"ID:{match['id']}")
    print(f"Score:{match['score']}")
    print(f"Metadata:{match['metadata']}")
    

Search Results:
ID:credit_card_policy
Score:0.609861732
Metadata:{'created_at': 1762573447.0, 'file_name': 'credit_card_policy.html', 'heading_keywords': ['payment', 'policy', 'contact', 'prevention', 'credit', 'security', 'and', 'fraud', 'billing', 'rewards', 'card', 'cashback'], 'headings': 'Credit Card Billing and Payment Policy | Rewards and Cashback | Security and Fraud Prevention | Contact'}
