1. getAPIKey()
2. getURLs()
3. documents = load_url_contents(URLS)                    # WebBaseLoader
4. chunks = chunk_documents(documents)                    # RecursiveCharacterTextSplitter
5. vectorstore = embed_and_store(chunks, embeddings)      # Deduplicates, embeds, stores (Chroma)
6. retriever = build_retriever(vectorstore)               # vectorstore.as_retriever()
7. llm = initialize_llm(api_key)                          # ChatGoogleGenerativeAI
8. chain = create_rag_chain(retriever, llm)               # RetrievalQA
9. ask_questions(chain)                                   # Interactive Q&A
   ‚îî‚îÄ‚îÄ For each response:
       a. raw_context_chunks = response["source_documents"]
       b. filtered_chunks = filter_chunks(raw_context_chunks)
       c. pass user_query + filtered_chunks to LLM


In [6]:
GOOGLE_API_KEY = "AIzaSyDncmZUfSm9_hSrTWWn1gzpx8l13Q8F9UU"

In [7]:
import os
import sys
import hashlib
from langchain_community.document_loaders.web_base import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

# üîß CONFIG
VECTORSTORE_PATH = "chromadb_url_db"

URLS = [
    "https://en.wikipedia.org/wiki/Large_language_model",
    "https://en.wikipedia.org/wiki/Large_language_model#Multimodality",
    "https://en.wikipedia.org/wiki/Llama_(language_model)"
]
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") or GOOGLE_API_KEY

# üîê Content hashing for deduplication
 
def hash_content(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def load_url_contents(urls: list):
    all_docs = []
    for url in urls:
        print(f"üîç Loading: {url}")
        loader = WebBaseLoader(url)
        docs = loader.load()
        
        # üîó Add source metadata
        for doc in docs:
            doc.metadata["source"] = url
        
        all_docs.extend(docs)
    return all_docs

 # ‚úÇÔ∏è Chunk documents
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    print("‚úÇÔ∏è Splitting into chunks...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)

# üßπ Deduplicate by content hash
def deduplicate_chunks(chunks, vectorstore):
    print("üßπ Deduplicating...")
    existing = vectorstore.get()
    existing_hashes = set(hash_content(doc) for doc in existing["documents"])

    unique_chunks = []
    for chunk in chunks:
        chunk_hash = hash_content(chunk.page_content)
        if chunk_hash not in existing_hashes:
            unique_chunks.append(chunk)

    print(f"‚úÖ Deduplicated {len(chunks) - len(unique_chunks)} entries out of {len(chunks)}.")
    return unique_chunks

# üìÇ Load or create Chroma vectorstore
def load_or_create_vectorstore(path, embeddings):
    if os.path.exists(path):
        print("üìÇ Loading existing vectorstore...")
    else:
        print("üÜï Creating new vectorstore...")
    return Chroma(persist_directory=path, embedding_function=embeddings)

# üì¶ Embed and store
def embed_and_store(chunks, embeddings, db_path=VECTORSTORE_PATH):
    vectorstore = load_or_create_vectorstore(db_path, embeddings)
    unique_chunks = deduplicate_chunks(chunks, vectorstore)
    if unique_chunks:
        print(f"‚úÖ {len(unique_chunks)} new chunks to embed.")
        vectorstore.add_documents(unique_chunks)
        vectorstore.persist()
    else:
        print("üü∞ No new unique content to embed.")
    return vectorstore

# üîç Retriever from vectorstore
def build_retriever(vectorstore):
    return vectorstore.as_retriever()

# ü§ñ Gemini LLM
def initialize_llm(api_key):
    return ChatGoogleGenerativeAI(
        model="gemini-1.5-pro-latest",
        google_api_key=api_key,
        temperature=0.7
    )

# üîÅ RAG Chain
def create_rag_chain(retriever, llm):
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

# üí¨ Q&A loop
# üí¨ Ask questions interactively
# üí¨ Ask questions interactively
def ask_questions(chain):
    print("\nüí¨ Ask me anything about the documents. Type 'exit' to quit.")
    while True:
        question = input("üß† Your question: ").strip()
        if question.lower() in ["exit", "quit"]:
            break
        try:
            response = chain.invoke({"query": question})
            print("\nüí° Answer:")
            print(response["result"])

            # üìå Print Source URLs
            source_docs = response.get("source_documents", [])
            if source_docs:
                print("\nüìö Source(s):")
                for doc in source_docs:
                    metadata = doc.metadata
                    url = metadata.get("source") or metadata.get("url")
                    if url:
                        print(f"üîó {url}")
            else:
                print("‚ö†Ô∏è No source documents returned.")

        except Exception as e:
            print(f"‚ùå Error during query: {e}")


 
# üöÄ MAIN
def main():
    print("\nüöÄ Starting Multi-URL RAG Pipeline...")
    documents = load_url_contents(URLS)
    chunks = chunk_documents(documents)
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=GOOGLE_API_KEY
    )
    vectorstore = embed_and_store(chunks, embeddings)
    retriever = build_retriever(vectorstore)
    llm = initialize_llm(GOOGLE_API_KEY)
    chain = create_rag_chain(retriever, llm)
    ask_questions(chain)

if __name__ == "__main__":
    main()



üöÄ Starting Multi-URL RAG Pipeline...
üîç Loading: https://en.wikipedia.org/wiki/Large_language_model
üîç Loading: https://en.wikipedia.org/wiki/Large_language_model#Multimodality
üîç Loading: https://en.wikipedia.org/wiki/Llama_(language_model)
‚úÇÔ∏è Splitting into chunks...
üìÇ Loading existing vectorstore...


  return Chroma(persist_directory=path, embedding_function=embeddings)


üßπ Deduplicating...
‚úÖ Deduplicated 279 entries out of 320.
‚úÖ 41 new chunks to embed.


  vectorstore.persist()



üí¨ Ask me anything about the documents. Type 'exit' to quit.


üß† Your question:  so summarize



üí° Answer:
Different attention heads in a model focus on different parts of the input sequence. For example, when processing the token "it_", one head might focus on the preceding words "The" and "animal," while another head focuses on the subsequent word "tired."

üìö Source(s):
üîó https://en.wikipedia.org/wiki/Large_language_model
üîó https://en.wikipedia.org/wiki/Large_language_model#Multimodality
üîó https://en.wikipedia.org/wiki/Large_language_model
üîó https://en.wikipedia.org/wiki/Large_language_model#Multimodality


üß† Your question:  exit


In [None]:
# util to clear vector store

In [None]:
import os
import shutil
import hashlib
from langchain.docstore.document import Document
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

CHROMA_DIR = "chromadb_store"

def hash_content(content):
    """Returns MD5 hash of stripped content for exact duplicate detection."""
    return hashlib.md5(content.strip().encode()).hexdigest()

def clear_vectorstore(mode="all", chroma_dir=CHROMA_DIR):
    """
    Utility to manage the Chroma vectorstore.

    Parameters:
    - mode: "all" or "deduplicate"
    - chroma_dir: directory path where Chroma index is stored
    """
    if mode == "all":
        if os.path.exists(chroma_dir):
            shutil.rmtree(chroma_dir)
            print("üßπ Vectorstore cleared completely (mode='all').")
        else:
            print("‚ÑπÔ∏è Vectorstore already empty.")
        return None

    elif mode == "deduplicate":
        try:
            api_key = os.getenv("GOOGLE_API_KEY") or GOOGLE_API_KEY
            embeddings = GoogleGenerativeAIEmbeddings(
                model="models/embedding-001",
                google_api_key=api_key
            )

            if not os.path.exists(chroma_dir):
                print("‚ö†Ô∏è Vectorstore folder not found. Nothing to deduplicate.")
                return None

            vs = Chroma(persist_directory=chroma_dir, embedding_function=embeddings)
            docs = vs.similarity_search("", k=1000)

            seen_hashes = set()
            unique_docs = []
            for doc in docs:
                content_hash = hash_content(doc.page_content)
                if content_hash not in seen_hashes:
                    seen_hashes.add(content_hash)
                    unique_docs.append(doc)

            print(f"üßπ Deduplicated {len(docs) - len(unique_docs)} entries out of {len(docs)}.")

            # Recreate Chroma store
            shutil.rmtree(chroma_dir, ignore_errors=True)
            new_vs = Chroma.from_documents(unique_docs, embeddings, persist_directory=chroma_dir)
            new_vs.persist()
            print("‚úÖ Deduplicated vectorstore saved.")

        except Exception as e:
            print(f"‚ùå Failed to deduplicate vectorstore: {e}")
    else:
        print("‚ùå Invalid mode. Use 'all' or 'deduplicate'.")


def count_vectorstore_documents(chroma_dir=CHROMA_DIR):
    """
    Prints and returns the number of documents in the Chroma vectorstore.

    Parameters:
    - chroma_dir: directory path where Chroma index is stored
    """
    try:
        api_key = os.getenv("GOOGLE_API_KEY") or GOOGLE_API_KEY
        embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=api_key
        )

        if not os.path.exists(chroma_dir):
            print("üìÇ No vectorstore found at the given path.")
            return 0

        vs = Chroma(persist_directory=chroma_dir, embedding_function=embeddings)
        # Count documents based on internal docstore
        count = len(vs._collection.get()["ids"])
        print(f"üìä Vectorstore document count: {count}")
        return count

    except Exception as e:
        print(f"‚ùå Failed to count documents: {e}")
        return 0


In [38]:
# Before deduplication
count_vectorstore_documents()

# Clear everything
#clear_vectorstore(mode="all")

clear_vectorstore(mode="deduplicate")



# After deduplication or any update
count_vectorstore_documents()


üìä Vectorstore document count: 1052
üßπ Deduplicated 825 entries out of 1000.
‚úÖ Deduplicated vectorstore saved.
üìä Vectorstore document count: 1227


1227

In [None]:
# TO CHECK FOR NOTEBOOKS THAT ARE STILL IN THE PROCESS THAT ARE LOCKING UP COMMON RESOURCES LIKE CHROMADB

import psutil

def check_open_handles(path="chromadb_store"):
    proc = psutil.Process()
    open_files = proc.open_files()
    locked = [f.path for f in open_files if path in f.path]
    print(f"üîç Open handles in use: {locked}")

check_open_handles()

In [None]:
# to remove TOC and other data that is not relevant , sometimes page numbers etc


In [8]:
import re

# Add more as needed
NOISE_PATTERNS = [
    r"^table of contents$", r"^contents$", r"^index$", r"^references$", 
    r"^page\s*\d+$", r"^\s*$", r"^introduction$", r"^chapter \d+", 
    r"^\d+$", r"^appendix$", r"^see also$", r"^summary$"
]

def is_noisy(text):
    """Returns True if the text matches noise patterns."""
    cleaned = text.strip().lower()
    for pattern in NOISE_PATTERNS:
        if re.match(pattern, cleaned):
            return True
    return False

def filter_retrieved_docs(docs):
    """Remove docs that are likely noise."""
    clean_docs = []
    for doc in docs:
        content = doc.page_content.strip()
        if not is_noisy(content) and len(content) > 30:  # Length filter optional
            clean_docs.append(doc)
    print(f"üßπ Filtered {len(docs) - len(clean_docs)} noisy docs.")
    return clean_docs


In [10]:
def ask_questions_filter(chain):
    print("\nüí¨ Ask me anything about the documents. Type 'exit' to quit.")
    while True:
        question = input("üß† Your question: ").strip()
        if question.lower() in ["exit", "quit"]:
            break
        try:
            response = chain.invoke({"query": question})
            
            # ‚úÖ Filter noisy chunks
            source_docs = response.get("source_documents", [])
            clean_docs = filter_retrieved_docs(source_docs)

            # üß† Optionally: Re-run the query with only clean_docs if needed
            print("\nüí° Answer:")
            print(response["result"])

            # üîó Show filtered source docs
            if clean_docs:
                print("\nüìö Filtered Source(s):")
                for doc in clean_docs:
                    url = doc.metadata.get("source") or doc.metadata.get("url")
                    snippet = doc.page_content[:100].replace("\n", " ")
                    print(f"üîó {url} | üìÑ {snippet}...")
            else:
                print("‚ö†Ô∏è All source documents were filtered as noise.")

        except Exception as e:
            print(f"‚ùå Error during query: {e}")




In [12]:
def main():
    print("\nüöÄ Starting Multi-URL RAG Pipeline...")
    URLS = [
    "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population",
    "https://en.wikipedia.org/wiki/Table_of_contents"
    ]
    
    documents = load_url_contents(URLS)
    chunks = chunk_documents(documents)
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=GOOGLE_API_KEY
    )
    vectorstore = embed_and_store(chunks, embeddings)
    retriever = build_retriever(vectorstore)
    llm = initialize_llm(GOOGLE_API_KEY)
    chain = create_rag_chain(retriever, llm)
    ask_questions_filter(chain)

if __name__ == "__main__":
    main()


üöÄ Starting Multi-URL RAG Pipeline...
üîç Loading: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population
üîç Loading: https://en.wikipedia.org/wiki/Table_of_contents
‚úÇÔ∏è Splitting into chunks...
üìÇ Loading existing vectorstore...
üßπ Deduplicating...
‚úÖ Deduplicated 0 entries out of 107.
‚úÖ 107 new chunks to embed.

üí¨ Ask me anything about the documents. Type 'exit' to quit.


üß† Your question:  talk to me about population and avoiding table of contents


üßπ Filtered 0 noisy docs.

üí° Answer:
This article discusses the table of contents and its usage but does not contain information about population.  Therefore, I'm unable to answer your question about population and avoiding the table of contents.

üìö Filtered Source(s):
üîó https://en.wikipedia.org/wiki/Table_of_contents | üìÑ Form[edit] The depth of detail in tables of contents depends on the length, complexity, and type of ...
üîó https://en.wikipedia.org/wiki/Table_of_contents | üìÑ Download as PDFPrintable version      		In other projects 	   Wikimedia CommonsWikidata item        ...
üîó https://en.wikipedia.org/wiki/Table_of_contents | üìÑ See also[edit]  Books portal    Wikimedia Commons has media related to Tables of contents.  Index (p...
üîó https://en.wikipedia.org/wiki/Table_of_contents | üìÑ In electronic documents[edit] Many popular word processors, such as Microsoft Word, WordPerfect, and...


üß† Your question:  exit


In [None]:
# A feature to List the kind of return value retrived ....


In [None]:
def retrieve_chunks_with_preview(query, retriever, k=5):
    results = retriever.invoke(query, config={"k": k})  # ‚úÖ updated from get_relevant_documents
    for i, doc in enumerate(results):
        print(f"\nChunk {i+1}:")
        print(doc.page_content[:500])  # Preview first 500 chars (optional)
    return results


In [16]:
def main():
    print("\nüöÄ Starting Multi-URL RAG Pipeline...")
    URLS = [
        "https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population",
        "https://en.wikipedia.org/wiki/Table_of_contents"
    ]
    
    documents = load_url_contents(URLS)
    chunks = chunk_documents(documents)
    
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=GOOGLE_API_KEY
    )
    vectorstore = embed_and_store(chunks, embeddings)
    retriever = build_retriever(vectorstore)
    #llm = initialize_llm(GOOGLE_API_KEY)
    #chain = create_rag_chain(retriever, llm)
    
    # üîç Preview retrieved chunks before using the chain
    query = "What are the largest cities in the US?"
    print("\nüß© Previewing retrieved chunks:")
    _ = retrieve_chunks_with_preview(query, retriever, k=5)
    
    # üß† Then run the full RAG chain
    #ask_questions_filter(chain)

if __name__ == "__main__":
    main()


üöÄ Starting Multi-URL RAG Pipeline...
üîç Loading: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population
üîç Loading: https://en.wikipedia.org/wiki/Table_of_contents
‚úÇÔ∏è Splitting into chunks...
üìÇ Loading existing vectorstore...
üßπ Deduplicating...
‚úÖ Deduplicated 107 entries out of 107.
üü∞ No new unique content to embed.

üß© Previewing retrieved chunks:

--- Chunk 1 ---
Edit links











ArticleTalk





English

















ReadView sourceView history







Tools





Tools
move to sidebar
hide



		Actions
	


ReadView sourceView history





		General
	


What links hereRelated changesUpload filePermanent linkPage informationCite this pageGet shortened URLDownload QR code





		Print/export
	


Download as PDFPrintable version





		In other projects
	


Wikidata item





















Appearance
move to sidebar
hide












From Wikipedia,

--- Chunk 2 ---
3
Other U.S. territories








4
Census-designated places








5
C

In [17]:
# EXCERCISE NOW TRY TO APPLY NOISE FILTERS AND CLEAN UP CHUCK BEFORE EMBEDDING OR GET MORE K responses and REMOVE CHUNKS THAT ARE NOISY

In [None]:
def is_noisy(text):
    cleaned = text.strip().lower()

    # Match known noise headings
    for pattern in NOISE_PATTERNS:
        if re.match(pattern, cleaned):
            return True

    # Remove chunks with too many short lines (menu-style junk)
    lines = cleaned.splitlines()
    short_lines = [line for line in lines if len(line.strip()) < 20]
    if len(short_lines) > 0.7 * len(lines):  # More than 70% short lines
        return True

    # Remove if too little alphanumeric content
    alpha_chars = sum(c.isalnum() for c in cleaned)
    if alpha_chars < 50:
        return True

    return False

def chunk_documents(docs):
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    )

    all_chunks = []
    for doc in docs:
        splits = splitter.split_documents([doc])
        clean_splits = [chunk for chunk in splits if not is_noisy(chunk.page_content)]
        all_chunks.extend(clean_splits)
    
    print(f"‚úÖ Chunked {len(all_chunks)} clean chunks from {len(docs)} documents")
    return all_chunks
