<a href="https://colab.research.google.com/github/graceugochinneji/web-rag-intelligent-website-assistant-pro/blob/master/Project%202%20-%20InsuranceClaims-RAG-AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries for document processing, embeddings, and vector storage
# Core LangChain community integrations
# For splitting documents into smaller text chunks
# LangChain wrapper for Chroma vector database
# Pre-trained models for creating embeddings
# Pre-trained models for creating embeddings
# Chroma vector database for storing/retrieving embeddings
# PDF parsing and text extraction
# Extract text from Microsoft Word (.docx) files
# Extract text/content from PowerPoint (.pptx) files
# Parse and extract text from HTML/XML documents

!pip -q install -U \ langchain-community \ langchain-text-splitters \ langchain-chroma \ sentence-transformers \ chromadb \ pypdf \ docx2txt \ python-pptx \ beautifulsoup4


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset files can be read
# like local files. The first run prompts you to authorize access.
drive.mount('/content/drive')

# Root folder of the dataset inside the mounted Drive.
# load_all(DATA_DIR) walks this folder recursively, and each file's immediate
# parent folder name is stored as its "week" metadata.
# Change this only if your dataset is saved in a different folder.
DATA_DIR = "/content/drive/MyDrive/ml_dataset"


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab environment under '/content/drive'.
# The first run creates a link for you to authorize access.
# NOTE(review): this cell repeats the import and mount already done in the
# previous cell and defines nothing new; it looks safe to skip or delete once
# that cell has run — confirm before removing.
drive.mount('/content/drive')


In [None]:
from pathlib import Path
from typing import List
import re

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from pptx import Presentation  # python-pptx

# ----------------------------
# Load PDF documents
# ----------------------------
def load_pdf(path: Path) -> List[Document]:
    """Load a PDF with PyPDFLoader and tag every returned Document.

    Args:
        path: Location of the PDF file.

    Returns:
        The Documents produced by the loader (one per page, per the loader's
        behaviour), each with "source", "filename", "ext" and "week"
        added to (or overwritten in) its metadata.
    """
    pages = PyPDFLoader(str(path)).load()

    # File-level metadata is the same for every page, so build it once.
    file_meta = {
        "source": str(path),          # full file path
        "filename": path.name,        # just the file name
        "ext": path.suffix.lower(),   # lower-cased file extension
        "week": path.parent.name,     # parent folder (e.g., "Week 1")
    }
    for page in pages:
        page.metadata.update(file_meta)
    return pages

# ----------------------------
# Load DOCX documents
# ----------------------------
def load_docx(path: Path) -> List[Document]:
    """Load a Word (.docx) file with Docx2txtLoader and tag the result.

    Args:
        path: Location of the .docx file.

    Returns:
        The Documents produced by the loader (the whole file as a single
        Document, per the loader's behaviour), each with "source",
        "filename", "ext" and "week" added to (or overwritten in) its metadata.
    """
    loaded = Docx2txtLoader(str(path)).load()

    # Same file-level metadata keys as the other loaders in this cell.
    file_meta = {
        "source": str(path),
        "filename": path.name,
        "ext": path.suffix.lower(),
        "week": path.parent.name,
    }
    for doc in loaded:
        doc.metadata.update(file_meta)
    return loaded

# ----------------------------
# Load PPTX documents
# ----------------------------
def load_pptx(path: Path) -> List[Document]:
    """Turn each non-empty slide of a PowerPoint file into one Document.

    Only shapes that expose a text frame contribute text; slides that end up
    with no text at all are skipped.

    Args:
        path: Location of the .pptx file.

    Returns:
        One Document per slide that contains text. Metadata carries "source",
        "filename", "ext", "slide" (1-based slide number) and "week".
    """
    deck = Presentation(str(path))  # open PowerPoint

    # Values shared by every slide of this file.
    source = str(path)
    filename = path.name
    ext = path.suffix.lower()
    week = path.parent.name

    slide_docs = []
    for number, slide in enumerate(deck.slides, start=1):
        blocks = []
        for shape in slide.shapes:
            # Shapes without a text frame (pictures, charts, ...) carry no text here.
            if not getattr(shape, "has_text_frame", False):
                continue
            lines = [para.text for para in shape.text_frame.paragraphs if para.text]
            block = "\n".join(lines).strip()
            if block:
                blocks.append(block)

        # One text body per slide; skip slides that have none.
        text = "\n".join(blocks).strip()
        if not text:
            continue

        slide_docs.append(
            Document(
                page_content=text,
                metadata={
                    "source": source,
                    "filename": filename,
                    "ext": ext,
                    "slide": number,   # slide number
                    "week": week,
                },
            )
        )
    return slide_docs

# ----------------------------
# Load all documents from a directory
# ----------------------------
def load_all(base_dir: str) -> List[Document]:
    """Recursively load every supported document found under a directory.

    Files are dispatched by lower-cased extension: ".pdf" to load_pdf,
    ".docx" to load_docx and ".pptx" to load_pptx. Any other file type is
    ignored. A file whose loader raises is reported with a "[skip]" line and
    skipped, so one corrupt file does not abort the whole run.

    Args:
        base_dir: Root directory to scan (sub-directories included).

    Returns:
        All Documents from all supported files, in sorted path order.

    Raises:
        FileNotFoundError: If base_dir does not exist or is not a directory.
    """
    base = Path(base_dir)
    # Fail loudly on a wrong or unmounted path: rglob() on a missing directory
    # would otherwise just yield nothing and the caller would see "0 documents".
    if not base.is_dir():
        raise FileNotFoundError(f"Dataset directory not found: {base}")

    all_docs: List[Document] = []
    # rglob() order depends on the filesystem; sorting keeps the document
    # order (and everything derived from it) reproducible between runs.
    for p in sorted(base.rglob("*")):
        if not p.is_file():
            continue
        ext = p.suffix.lower()
        try:
            if ext == ".pdf":
                all_docs.extend(load_pdf(p))
            elif ext == ".docx":
                all_docs.extend(load_docx(p))
            elif ext == ".pptx":
                all_docs.extend(load_pptx(p))
            # Other file types are ignored
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and keep going.
            print(f"[skip] {p.name}: {e}")
    return all_docs

# ----------------------------
# Run the loader
# ----------------------------
# Load every supported file found under DATA_DIR (recursively).
docs = load_all(DATA_DIR)

# Drop empty or very short pages/slides: only Documents with MORE than
# 10 whitespace-separated words are kept (10 words or fewer are removed).
docs = [d for d in docs if len(d.page_content.split()) > 10]

print(f"Loaded {len(docs)} Documents")

# Quick peek at the first 5 docs: metadata plus the first 120 characters of
# text, with newlines replaced by spaces so each preview stays on one line.
for d in docs[:5]:
    print(d.metadata, "→", d.page_content[:120].replace("\n"," "), "…")


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ----------------------------------------
# STEP 1: Split documents into smaller chunks
# ----------------------------------------
# Input is `docs` from the loader cell; output `chunks` is what gets embedded.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,          # Maximum number of characters per chunk
    chunk_overlap=150,        # Characters shared by neighbouring chunks (keeps context across the cut)
    separators=["\n\n", "\n", " ", ""]  # Tried in this order: paragraph, line, word, then character
)
chunks = splitter.split_documents(docs)
print(f"Chunks: {len(chunks)}")

# ----------------------------------------
# STEP 2: Load Embedding Model
# ----------------------------------------
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Use GPU if available, otherwise fallback to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# HuggingFace sentence-transformer for embedding text into vectors
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # Lightweight, fast, good quality
    model_kwargs={"device": device}                      # Run on GPU/CPU as detected
)

# ----------------------------------------
# STEP 3: Create Vector Database with Chroma
# ----------------------------------------
# NOTE(review): "chroma_db" is a relative path, so the index is written under
# the current working directory. Because the collection is persisted,
# re-running this cell presumably adds the same chunks again — confirm, and
# clear the directory before re-indexing if duplicates show up in results.
vectorstore = Chroma.from_documents(
    documents=chunks,           # Chunked documents to embed & index
    embedding=emb,              # Embedding model
    collection_name="project_two",   # Name of your vector collection
    persist_directory="chroma_db"    # Local directory for saving index
)

# ----------------------------------------
# STEP 4: Setup Retriever for Search
# ----------------------------------------
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# "k=5" → return top 5 most relevant chunks for each query

print("Chroma index ready.")


In [None]:
from google.colab import ai

# ----------------------------------------
# Utility function to wrap long text
# ----------------------------------------
def wrap_text(text, words_per_line=15):
    """
    Re-flow text so that every output line holds a fixed number of words,
    which keeps long answers readable in Colab outputs.

    The text is split on any whitespace, so existing line breaks and
    repeated spaces are not preserved.

    Args:
        text (str): The input text to wrap.
        words_per_line (int): Number of words per line (the last line may be shorter).

    Returns:
        str: The words re-joined with single spaces, one group per line.
    """
    tokens = text.split()
    # Start index of each line: 0, words_per_line, 2 * words_per_line, ...
    starts = range(0, len(tokens), words_per_line)
    return "\n".join(
        " ".join(tokens[start:start + words_per_line]) for start in starts
    )


# ----------------------------------------
# Main Q&A function
# ----------------------------------------
def ask(question, k=5, max_chars=1200):
    """
    Retrieves context documents, builds a prompt,
    queries the AI model, and wraps the answer.

    Relies on the module-level `vectorstore` (the Chroma index), the
    `ai` module imported from google.colab, and `wrap_text`.

    Args:
        question (str): The user question.
        k (int): Number of top documents to retrieve.
        max_chars (int): Maximum characters of each document to include in context.

    Returns:
        tuple: (wrapped answer string, list of context documents)
    """
    # Build the retriever per call so that `k` is actually honoured; with the
    # default k=5 this matches the module-level `retriever` configuration.
    # `invoke` replaces the deprecated `get_relevant_documents` entry point.
    ctx_docs = vectorstore.as_retriever(search_kwargs={"k": k}).invoke(question)

    # Build the context string: a numbered header with file + week metadata,
    # followed by at most `max_chars` characters of each document.
    context = "\n\n".join(
        f"[{i+1}] {d.metadata.get('filename')} ({d.metadata.get('week')})\n{d.page_content[:max_chars]}"
        for i, d in enumerate(ctx_docs)
    )

    # Construct the AI prompt (the model is told to answer from the context only)
    prompt = (
        "You are a helpful assistant. Answer ONLY from the context.\n\n"
        f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
    )

    # Generate AI response
    result = ai.generate_text(prompt)

    # Wrap the response text for readability
    return wrap_text(result), ctx_docs


# ----------------------------------------
# Example usage
# ----------------------------------------
# ask() returns the wrapped answer text and the retrieved context Documents.
answer, sources = ask("What is the differences between week 1 and week 2 content?")

# Print formatted answer
print(answer)

# List the file name and "week" folder of every chunk that was passed to the
# model as context (one line per chunk, so a file can appear more than once).
print("\nSources:")
for s in sources:
    print("-", s.metadata.get("filename"), "|", s.metadata.get("week"))
