In [1]:
# notebooks/01_markdown_eda.ipynb
# Exploratory Data Analysis for Markdown Documents

# ==============================
# 1. Setup & Imports
# ==============================
import os
import re
from pathlib import Path
from collections import defaultdict

from langchain_community.document_loaders import TextLoader


# ==============================
# 2. Configuration
# ==============================
MARKDOWN_DIR = Path("data/markdown_docs")

# Boilerplate phrases to remove (extend carefully)
BOILERPLATE_PATTERNS = [
    r"Lecture note prepared by.*",
    r"Prepared by.*",
]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# ==============================
# 3. Utility Functions
# ==============================

def light_clean_markdown(text: str) -> str:
    """
    Light cleaning only:
    - Remove extra blank lines
    - Remove known boilerplate phrases
    - Preserve code blocks and markdown syntax
    """
    # Remove boilerplate lines
    for pattern in BOILERPLATE_PATTERNS:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)

    # Normalize excessive blank lines (3+ -> 2)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def count_words(text: str) -> int:
    return len(re.findall(r"\b\w+\b", text))


def extract_code_blocks(text: str):
    return re.findall(r"```[\s\S]*?```", text)


def extract_sections(text: str):
    """Split markdown by headings (#, ##, ###...)"""
    sections = re.split(r"(?=^#{1,6}\s)", text, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]


In [3]:

# ==============================
# 4. Load Markdown Files
# ==============================
markdown_files = list(MARKDOWN_DIR.glob("*.md"))

print(f"Number of Markdown files: {len(markdown_files)}")

raw_documents = []
for md_file in markdown_files:
    loader = TextLoader(str(md_file), encoding="utf-8")
    docs = loader.load()
    raw_documents.extend(docs)


Number of Markdown files: 5


In [4]:

# ==============================
# 5. Exploratory Analysis
# ==============================
file_stats = []
section_analysis = []

for doc in raw_documents:
    raw_text = doc.page_content
    cleaned_text = light_clean_markdown(raw_text)

    word_count = count_words(cleaned_text)
    code_blocks = extract_code_blocks(cleaned_text)
    sections = extract_sections(cleaned_text)

    file_stats.append({
        "source": doc.metadata.get("source", "unknown"),
        "word_count": word_count,
        "num_code_blocks": len(code_blocks),
        "num_sections": len(sections),
    })

    for sec in sections:
        sec_words = count_words(sec)
        sec_code_blocks = extract_code_blocks(sec)

        section_analysis.append({
            "source": doc.metadata.get("source", "unknown"),
            "section_preview": sec[:80].replace("\n", " ") + "...",
            "word_count": sec_words,
            "code_blocks": len(sec_code_blocks),
        })


In [5]:

# ==============================
# 6. Aggregate Metrics
# ==============================
total_words = sum(f["word_count"] for f in file_stats)
avg_words = total_words / len(file_stats) if file_stats else 0

print(f"Total word count: {total_words}")
print(f"Average words per file: {avg_words:.2f}")


Total word count: 16364
Average words per file: 3272.80


In [6]:

# ==============================
# 7. Identify Notable Sections
# ==============================
LONG_SECTION_THRESHOLD = 300
CODE_HEAVY_RATIO = 0.5

long_sections = [
    s for s in section_analysis
    if s["word_count"] >= LONG_SECTION_THRESHOLD
]

code_dominated_sections = [
    s for s in section_analysis
    if s["code_blocks"] > 0 and s["code_blocks"] >= CODE_HEAVY_RATIO * max(1, s["word_count"])
]

print(f"Sections with long explanations (>{LONG_SECTION_THRESHOLD} words): {len(long_sections)}"),print(f"Sections dominated by code blocks: {len(code_dominated_sections)}")


Sections with long explanations (>300 words): 6
Sections dominated by code blocks: 0


(None, None)

In [7]:

# ==============================
# 8. Sample Outputs (Inspection)
# ==============================
print("\nSample file stats:")
for f in file_stats[:3]:
    print(f)

print("\nSample long sections:")
for s in long_sections[:3]:
    print(s)

# ==============================
# 9. Next Step (RAG-ready Output)
# ==============================
# At this point, cleaned_text per document can be:
# - Chunked
# - Embedded
# - Stored in a vector database
# without losing markdown structure or code blocks



Sample file stats:
{'source': 'data\\markdown_docs\\boot2.md', 'word_count': 469, 'num_code_blocks': 5, 'num_sections': 20}
{'source': 'data\\markdown_docs\\boot3.md', 'word_count': 325, 'num_code_blocks': 6, 'num_sections': 21}
{'source': 'data\\markdown_docs\\module3.md', 'word_count': 3007, 'num_code_blocks': 0, 'num_sections': 142}

Sample long sections:
{'source': 'data\\markdown_docs\\note.md', 'section_preview': '### USER EXPERIENCE DESIGN SOLUTIONS  ``` Please don’t copy the content without ...', 'word_count': 1574, 'code_blocks': 86}
{'source': 'data\\markdown_docs\\note.md', 'section_preview': '### USER EXPERIENCE DESIGN SOLUTIONS  UNIT 2 : PLANNING WEBSITE DESIGN  Please d...', 'word_count': 1516, 'code_blocks': 76}
{'source': 'data\\markdown_docs\\note.md', 'section_preview': '### USER EXPERIENCE DESIGN SOLUTIONS  ``` Please don’t copy the content without ...', 'word_count': 804, 'code_blocks': 44}


In [8]:
# src/build_vectorstore.py
"""
Task 2: Text Chunking, Embedding, and Vector Store Creation

Objective:
Convert Markdown documents into semantically meaningful chunks,
embed them, and store them for fast retrieval.
"""

import os
import re
from pathlib import Path
from typing import List

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document





# ==============================
# Configuration
# ==============================
MARKDOWN_DIR = Path("data/markdown_docs")
VECTORSTORE_DIR = Path("vectorstore")
VECTORSTORE_DIR.mkdir(exist_ok=True)

CHUNK_SIZE = 400
CHUNK_OVERLAP = 80

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


In [9]:


# ==============================
# Helper Functions
# ==============================


def contains_code_block(text: str) -> bool:
    """Check whether a chunk contains a fenced code block."""
    return bool(re.search(r"```[\s\S]*?```", text))




def extract_sections_with_headings(text: str):
    """
    Split markdown into (section_heading, section_text) pairs.
    If no heading exists, section_heading is None.
    """
    sections = re.split(r"(?=^#{1,6}\s)", text, flags=re.MULTILINE)
    results = []


    for sec in sections:
        sec = sec.strip()
        if not sec:
            continue


        lines = sec.splitlines()
        if lines[0].startswith("#"):
            heading = lines[0].lstrip("#").strip()
            body = "\n".join(lines[1:]).strip()
        else:
            heading = None
            body = sec


        results.append((heading, body))


    return results

In [10]:
# ==============================
# Load Markdown Documents
# ==============================
from typing import List


documents: List[Document] = []

for md_file in MARKDOWN_DIR.glob("*.md"):
    loader = TextLoader(str(md_file), encoding="utf-8")
    loaded_docs = loader.load()

    for doc in loaded_docs:
        sections = extract_sections_with_headings(doc.page_content)

        for heading, section_text in sections:
            if not section_text.strip():
                continue

            documents.append(
                Document(
                    page_content=section_text,
                    metadata={
                        "source_file": md_file.name,
                        "section_heading": heading,
                    },
                )
            )

print(f"Loaded {len(documents)} section-level documents")


Loaded 214 section-level documents


In [11]:
# ==============================
# Chunking Strategy
# ==============================
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

chunked_documents: List[Document] = []
for doc in documents:
    chunks = text_splitter.split_text(doc.page_content)
    for chunk in chunks:
        chunked_documents.append(
            Document(
                page_content=chunk,
                metadata={
                    **doc.metadata,
                    "contains_code": contains_code_block(chunk),
                },
            )
        )

print(f"Created {len(chunked_documents)} chunks")

Created 520 chunks


In [12]:
# ==============================
# Embeddings
# ==============================
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


# ==============================
# Vector Store (FAISS)
# ==============================
vectorstore = FAISS.from_documents(
documents=chunked_documents,
embedding=embeddings,
)


vectorstore.save_local(str(VECTORSTORE_DIR))


print("Vector store successfully saved to 'vectorstore/'")


# ==============================
# Notes
# ==============================
# The saved vector store contains:
# - Embedded chunks
# - Metadata (source_file, section_heading, contains_code)
# - Original chunk text
# Ready for fast semantic retrieval in a RAG pipeline

  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


Vector store successfully saved to 'vectorstore/'


In [13]:
# src/rag_pipeline.py
"""
Task 3: RAG Core Logic (Retrieval + Generation)
Objective:
Retrieve relevant Markdown chunks and generate answers strictly
from the provided context.
"""


import os
from pathlib import Path
from typing import List, Tuple

from langchain_community.llms import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate
from transformers import pipeline


# ==============================
# Configuration
# ==============================
VECTORSTORE_DIR = Path("vectorstore")
TOP_K = 5


EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "google/flan-t5-small" # Streamlit-friendly


# ==============================
# Prompt Template (STRICT RAG)
# ==============================
PROMPT_TEMPLATE = """
You are an educational assistant for web development students.

Your task is to answer questions about Bootstrap 5.

Answering rules (VERY IMPORTANT):
1. First, search for the answer using reliable general knowledge about Bootstrap 5.
2. Then, check the provided Context.
3. If the Context contains relevant information, prioritize and align your answer with it.
4. Do NOT contradict the Context.
5. Do NOT guess or invent information.
6. If neither outside knowledge nor the Context provides a clear answer, respond exactly with:
   "I cannot find this information in the provided materials."

Formatting & style rules:
- Explain concepts in simple, student-friendly language.
- Use Markdown formatting.
- When code is relevant, include it inside proper code blocks.
- When explaining HTML or Bootstrap classes, clearly describe what each part does.

Context:
{context}

Question:
{question}

Answer:

"""


prompt = PromptTemplate(
input_variables=["context", "question"],
template=PROMPT_TEMPLATE,
)

In [14]:
# ==============================
# Load Vector Store
# ==============================
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


vectorstore = FAISS.load_local(
str(VECTORSTORE_DIR),
embeddings,
allow_dangerous_deserialization=True,
)


retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

In [15]:
# ==============================
# Load Lightweight LLM
# ==============================
text2text_pipeline = pipeline(
"text2text-generation",
model=LLM_MODEL_NAME,
max_length=512,
)


llm = HuggingFacePipeline(pipeline=text2text_pipeline)

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=text2text_pipeline)


In [16]:
# ==============================
# RAG Answer Function
# ==============================




def answer_question(question: str) -> Tuple[str, List[Document]]:
    """
    Retrieve relevant chunks and generate an answer strictly from context.

    Returns:
    - answer text (str)
    - retrieved source documents (List[Document])
    """

    docs = retriever.invoke(question)


    context = "\n\n".join(
        f"Source: {d.metadata.get('source_file')} | "
        f"Section: {d.metadata.get('section_heading')}\n"
        f"{d.page_content}"
        for d in docs
    )

    final_prompt = prompt.format(
        context=context,
        question=question
    )

    answer = llm.invoke(final_prompt)


    return answer, docs


In [17]:


# ==============================
# Example Usage (Manual Test)
# ==============================
if __name__ == "__main__":
    sample_question = "What does navbar mean in card?"
    response, sources = answer_question(sample_question)


    print("Answer:\n", response)
    print("\nRetrieved Sources:")
    for s in sources:
        print("-", s.metadata)

Token indices sequence length is longer than the specified maximum sequence length for this model (561 > 512). Running this sequence through the model will result in indexing errors


Answer:
 A responsive navigation header used for **site branding and links**.

Retrieved Sources:
- {'source_file': 'boot2.md', 'section_heading': 'What is a Navbar?', 'contains_code': False}
- {'source_file': 'note.md', 'section_heading': 'EXAM PREP', 'contains_code': False}
- {'source_file': 'note.md', 'section_heading': 'EXAM PREP', 'contains_code': False}
- {'source_file': 'note.md', 'section_heading': 'EXAM PREP', 'contains_code': True}
- {'source_file': 'note.md', 'section_heading': 'EXAM PREP', 'contains_code': False}
