<a href="https://colab.research.google.com/github/graceugochinneji/web-rag-intelligent-website-assistant-pro/blob/master/Webscraping_%2B_RAG_AI_Assistant_for_Education.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries:
# - langchain-community: community tools/integrations for LangChain
# - langchain-text-splitters: split docs into chunks for embeddings/RAG
# - chromadb: vector database for storing/retrieving embeddings
# - sentence-transformers: pretrained models for text embeddings
# - beautifulsoup4: parse/extract text from HTML/XML
!pip -q install -U langchain-community langchain-text-splitters chromadb sentence-transformers beautifulsoup4

In [None]:
# Crawl multiple pages on educosys.com, extract readable text, and keep only useful pages.

from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup
from pathlib import Path
import re

# Crawl configuration.
# - START_URL: seed page; it is passed to the crawler as `url=`.
# - MAX_DEPTH: passed to the crawler as `max_depth=`; it bounds how many link
#   levels away from the seed page the crawl may go.
START_URL = "https://www.educosys.com"
MAX_DEPTH = 2  # Increase to 3–4 to cover more pages (more time/requests).

def bs4_extractor(html: str) -> str:
    """
    Reduce an HTML document to its visible text, one non-empty line per row.

    Steps:
    - Parse the markup with BeautifulSoup's built-in "html.parser".
    - Delete boilerplate / non-content tags (script, style, noscript, header,
      footer, nav) together with everything nested inside them.
    - Extract the remaining text, strip each line, and drop lines that end up empty.
    """
    boilerplate_tags = ["script", "style", "noscript", "header", "footer", "nav"]

    document = BeautifulSoup(html, "html.parser")
    for tag in document.find_all(boilerplate_tags):
        tag.decompose()

    cleaned_lines = []
    for raw_line in document.get_text(separator="\n").splitlines():
        stripped = raw_line.strip()
        if stripped:
            cleaned_lines.append(stripped)
    return "\n".join(cleaned_lines)

# RecursiveUrlLoader:
# - Starts at START_URL and follows internal links up to MAX_DEPTH.
# - Uses our bs4_extractor to clean each page to plain text.
# - Stays on the same domain (prevent_outside=True).
# - use_async=False is important in notebooks to avoid asyncio event-loop errors.
# - exclude_dirs entries are matched as *prefixes of the absolute URL*
#   (str.startswith), not as substrings. Bare fragments such as "/logout" or
#   ".pdf" therefore never match anything, so the exclusion is given as a full
#   URL prefix. Binary files are filtered out by extension after loading.
loader = RecursiveUrlLoader(
    url=START_URL,
    max_depth=MAX_DEPTH,
    extractor=bs4_extractor,
    prevent_outside=True,     # don't leave the START_URL site
    use_async=False,          # safer in Colab/Jupyter (no asyncio.run inside a running loop)
    timeout=30,               # per-request timeout (seconds)
    check_response_status=True,
    exclude_dirs=[f"{START_URL.rstrip('/')}/logout"],  # absolute prefix, see note above
)

# Fetch all raw pages found by the crawler (as LangChain Document objects).
docs_raw = loader.load()

# Post-filter the documents:
# - Keep only URLs that start with START_URL (so changing the seed URL above
#   does not silently filter out every page).
# - Drop binaries by extension. The extension may be followed by a query string
#   or fragment (e.g. ".../file.pdf?download=1"), so the pattern accepts "?",
#   "#" or end-of-string after it rather than end-of-string only.
# - Require a minimum amount of text (> 30 words) to skip tiny/empty pages.
_BINARY_URL_RE = re.compile(r"\.(pdf|zip|jpg|jpeg|png|svg|gif|ico)(?:[?#]|$)", re.I)

docs = [
    d for d in docs_raw
    if d.metadata.get("source", "").startswith(START_URL)
    and not _BINARY_URL_RE.search(d.metadata.get("source", ""))
    and len(d.page_content.split()) > 30
]

print(f"Kept {len(docs)} pages")


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter  # splits long docs into manageable, overlapping chunks (good for RAG)

# Recursive splitter: separators are listed from coarsest to finest, so a cut is
# made on paragraphs first, then lines, then words, and on raw characters last.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],  # paragraph → line → word → character
    chunk_overlap=150,                    # characters shared by consecutive chunks (keeps context continuous)
    chunk_size=1200,                      # upper bound on characters per chunk (≈ 300 tokens; tune per model)
)

# Chunk every kept page; the result is a list of chunked LangChain Documents.
chunks = splitter.split_documents(docs)

# Report how many chunks came out of the split.
print(f"{len(chunks)} chunks")


In [None]:
# Preview the first three chunks. Slicing (rather than indexing 0, 1, 2) avoids
# an IndexError when the crawl produced fewer than three chunks.
for preview in chunks[:3]:
    print(preview)

In [None]:
# Show the total number of chunks again.
print(len(chunks), "chunks")

In [None]:
# Re-install the notebook's libraries, this time including langchain-chroma.
# The other packages listed here are the same ones installed in the first cell.
!pip -q install -U langchain-community langchain-text-splitters langchain-chroma sentence-transformers chromadb beautifulsoup4


In [None]:
import torch  # installed as a dependency of sentence-transformers; used here only to detect a GPU

from langchain_community.embeddings import HuggingFaceEmbeddings    # local (free) sentence-transformer embeddings
from langchain_community.vectorstores import Chroma                 # vector DB for storing/retrieving embeddings
# NOTE: In newer LangChain, prefer: `from langchain_chroma import Chroma`

# Choose the embedding device at runtime so this cell works on both GPU and
# CPU-only runtimes (a hard-coded "cuda" fails when no GPU is attached).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate an embeddings model.
# - all-MiniLM-L6-v2: lightweight, fast, good general-purpose semantic search.
# - model_kwargs forwards the selected device to the underlying model.
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": device},
)

# Build (or append to) a Chroma collection from your chunked documents.
# - collection_name: logical name of your dataset inside Chroma.
# - persist_directory: on-disk folder; allows reusing the index across sessions.
# NOTE(review): because the collection is persisted, re-running this cell
# presumably adds the same chunks again — confirm, and clear "chroma_db" first
# if duplicate results show up.
vectorstore = Chroma.from_documents(
    documents=chunks,                # list[Document] produced by your splitter
    embedding=emb,                   # embedding function defined above
    collection_name="educosys",      # name of your collection
    persist_directory="chroma_db"    # path to store the index files
)

# Create a retriever interface for RAG.
# - k=5: return top-5 most similar chunks per query (tune based on quality/latency).
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [None]:
# Works only inside Google Colab: provides no-key access to several LLMs.
from google.colab import ai

def rag_answer(question: str, k: int = 5, max_chars: int = 1200) -> tuple[str, list]:
    """
    Retrieve relevant chunks from the Chroma index and ask Colab's LLM
    to answer *using only that context*.

    Args:
        question (str): user question (e.g., "How many LLM courses are there?")
        k (int): maximum number of chunks to include in the prompt (defaults to 5).
            The retriever decides how many chunks it returns, so fewer than k
            may be used; values below 1 yield no context.
        max_chars (int): max characters per chunk to keep prompt compact
    Returns:
        reply (str): model's answer text, or a fixed fallback message when no
            context is available
        docs  (List[Document]): exactly the documents that were placed in the
            prompt (for debugging/attribution); empty when nothing was used
    """

    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in langchain-core and scheduled for removal in 1.0.
    docs = retriever.invoke(question)

    # Trim once, up front, so the prompt context and the returned sources are
    # the same list. max(k, 0) keeps a negative k from slicing off the tail.
    docs = docs[:max(k, 0)]

    # Edge case: nothing to ground the answer on (empty index, no match, k < 1).
    if not docs:
        return "I couldn't find relevant context in the index.", []

    # Build a compact, traceable context block: numbered source URL + truncated text.
    context = "\n\n".join(
        f"[{i}] {d.metadata.get('source', '')}\n{d.page_content[:max_chars]}"
        for i, d in enumerate(docs, start=1)
    )

    # Constrain the model to ground its answer in the provided context only.
    prompt = (
        "You are a helpful assistant. Answer using ONLY the context.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    )

    # Generate an answer via Colab AI (no API key needed on Pro/Pro+ plans).
    reply = ai.generate_text(prompt)

    return reply, docs

# Demo: ask a single question, then print the answer followed by the URLs of
# the documents returned alongside it.
answer, sources = rag_answer("How many LLM courses are there?")
print(answer)

print("\nSources:")
for src_doc in sources:
    origin = src_doc.metadata.get("source", "")
    print("-", origin)