# Embed and Chunk

Because I am going to be taking a RAG approach, this notebook will be a helper notebook to embed and chunk the data scraped from the internet 

In [1]:
import os
from pathlib import Path
import pickle
from typing import Dict, List, Any

import numpy as np
from dotenv import load_dotenv
import openai

## Setup

In [2]:
# Resolve key paths relative to this notebook
NOTEBOOK_DIR = Path().resolve()
DATA_DIR = (NOTEBOOK_DIR / "../data/scraped_documents").resolve()
VECTOR_STORE_DIR = (NOTEBOOK_DIR / "../data/vector_store").resolve()
VECTOR_STORE_PATH = VECTOR_STORE_DIR / "vector_store.pkl"

print(f"Data directory: {DATA_DIR}")
print(f"Vector store directory: {VECTOR_STORE_DIR}")


Data directory: /Users/georgiaray_ic/Documents/coding/law_comparisons/phase_2/data/scraped_documents
Vector store directory: /Users/georgiaray_ic/Documents/coding/law_comparisons/phase_2/data/vector_store


In [3]:
# Load credentials and instantiate the OpenRouter client
load_dotenv()

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("Please set OPENROUTER_API_KEY before running embeddings.")

client = openai.OpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1"
)

EMBEDDING_MODEL = "text-embedding-3-large"
print("OpenRouter client configured.")


OpenRouter client configured.


## Chunking Utilities

In [4]:

def chunk_text(text: str, chunk_size: int = 200, chunk_overlap: int = 75) -> List[str]:
    """Split a document into overlapping word chunks of fixed size."""
    words = text.split()
    if not words:
        return []

    chunks: List[str] = []
    start = 0
    total_words = len(words)

    while start < total_words:
        end = min(total_words, start + chunk_size)
        chunk = " ".join(words[start:end]).strip()
        if chunk:
            chunks.append(chunk)
        if end >= total_words:
            break
        # Decide new start: handle overlap
        start += max(1, chunk_size - chunk_overlap)

    return chunks


def load_documents(data_dir: Path) -> Dict[str, str]:
    """Read every .txt file in the input directory."""
    documents: Dict[str, str] = {}
    for path in sorted(data_dir.glob("*.txt")):
        documents[path.stem] = path.read_text(encoding="utf-8")
    return documents


documents = load_documents(DATA_DIR)
print(f"Loaded {len(documents)} documents from {DATA_DIR}.")


Loaded 157 documents from /Users/georgiaray_ic/Documents/coding/law_comparisons/phase_2/data/scraped_documents.


In [5]:
from typing import Optional
import re

def trim_non_content(text: str) -> str:
    """
    Attempts to trim repeated navigation/boilerplate 'chrome' text
    like headers, navbars, footers, menus, and non-content at the head and tail of the doc.
    Now also attempts to aggressively remove 'media contact' and long government footer sections.
    Not guaranteed to catch everything, but tries to remove common patterns.
    """
    # Common boilerplate phrases likely indicating start/end of content
    head_patterns = [
        r"^(?:.*Canada\.ca.*\n){1,5}",  # Canada.ca spam header
        r"^Skip to main content[\n\r]+", 
        r"^Skip to [^\n]+\n", 
        r"^Language selection\n", 
        r"^(?:Français|fr|Gouvernement du Canada)[\n/ ]+",
        r"^Search[^\n]*\n", 
        r"^Menu\n", 
        r"^Main\n", 
        r"^[\w \-/]+\nJobs and the workplace\n",  # menu bar spam
        r"^(?:[\w ,/&-]+\n){3,10}You are here:[^\n]*\n",  # Common 'menu' preamble
        r"^From:[^\n]+\n News release[\n]*",  # News release preamble
    ]

    # Extended tail patterns to catch press/media contact and keyword megamenus
    tail_patterns = [
        r"\nReport a problem or mistake on this page.*$", 
        r"\nDate modified:[^\n]*$", 
        r"\n(?:Footer|End of Document|Contact us)[^\n]*$", 
        r"\nThis page was last updated.*$",
        # Pattern for typical "For media:" contact/info blocks
        r"\nFor media:[\s\S]+?(?=\n\S)",  # Stop at next headline, non-indented line
        # Pattern for "Media Relations" sections to end of text or next major block
        r"\nMedia Relations[\s\S]+?(?=\n\S|\Z)",
        # Any block of repeated contact/institution/string-ending info separated by newlines at the end
        r"(?:\n[\w \-.,/:\(\)@]+){6,}[\s\n]*$",  # If 6+ consecutive lines of mostly names, contacts, orgs, likely a tail
        # Remove "Search for related information by keyword: ..." (plus likely following menu/keyword lines)
        r"\nSearch for related information by keyword:[\s\S]+?(?=\n\S|\Z)",
        # Remove "Page details", "About this site", "Government of Canada", mega-menu block at tail
        r"\nPage details[\s\S]+?(?=\n\S|\Z)",
        r"\nAbout this site[\s\S]+?(?=\n\S|\Z)",
        r"\nGovernment of Canada[\s\S]+?(?=\n\S|\Z)",
        r"\nAll contacts[\s\S]+?(?=\n\S|\Z)",
    ]

    cleaned = text.strip()

    # Heuristically trim head
    for pat in head_patterns:
        cleaned_new = re.sub(pat, '', cleaned, flags=re.IGNORECASE|re.MULTILINE)
        if len(cleaned_new) < len(cleaned) - 8:  # Actually shortened content?
            cleaned = cleaned_new
            break

    # Heuristically trim tail
    trimmed = False
    for pat in tail_patterns:
        cleaned_new = re.sub(pat, '', cleaned, flags=re.IGNORECASE|re.MULTILINE)
        if len(cleaned_new) < len(cleaned) - 8:
            cleaned = cleaned_new
            trimmed = True  # Remove multiple from tail
    # Try tail removal twice (to catch two-stage footers)
    if trimmed:
        for pat in tail_patterns:
            cleaned_new = re.sub(pat, '', cleaned, flags=re.IGNORECASE|re.MULTILINE)
            if len(cleaned_new) < len(cleaned) - 8:
                cleaned = cleaned_new

    # Secondary: For repeated menu/footer junk, try to cut on keyword
    NON_CONTENT_KEYWORDS = [
        "You are here:",
        "Main Menu",
        "Search Canada.ca",
        "Back to top",
        "Date modified:", 
        "Report a problem or mistake on this page",
        "Contact us",
        "Page details",
        "About this site",
        "Government of Canada",
        "All contacts",
        "Departments and agencies",
        "Benefits",
        "Social media",
        "Privacy",
        "Terms and conditions",
        "Mobile applications",
        "Themes and topics",
        "Follow us on",
    ]
    # Remove lines at top or bottom containing only these keywords
    lines = cleaned.splitlines()
    # Remove leading non-content lines
    while lines and any(k.lower() in lines[0].lower() for k in NON_CONTENT_KEYWORDS):
        lines = lines[1:]
    # Remove trailing non-content lines
    while lines and any(k.lower() in lines[-1].lower() for k in NON_CONTENT_KEYWORDS):
        lines = lines[:-1]

    # Remove trailing contact blocks that start with a name/email/phone/office-style sequence
    # Heuristic: 4+ lines at end with email/phone/org/office or a lot of short lines
    tail_lines = lines[-12:]
    for i in range(len(tail_lines)):
        slice_ = tail_lines[i:]
        # Must be 4+ trailing "contact-ish" lines
        if (len(slice_) >= 4 and
            sum(bool(re.search(r"\b(?:@|[0-9]{3}-[0-9]{3}-[0-9]{4}|\([0-9]{3}\)|Director|Media|Office|Relations|Ottawa|Canada|Minister)", l, re.IGNORECASE)) for l in slice_) >= 2):
            lines = lines[:-len(slice_)]
            break

    return "\n".join(lines).strip()


In [6]:
for doc in documents:
    documents[doc] = trim_non_content(documents[doc])


In [7]:
chunk_records: List[Dict[str, Any]] = []
for doc_name, text in documents.items():
    chunks = chunk_text(text)
    if not chunks:
        continue
    for idx, chunk in enumerate(chunks):
        chunk_records.append(
            {
                "doc_name": doc_name,
                "chunk_index": idx,
                "text": chunk,
            }
        )

print(f"Prepared {len(chunk_records)} chunks across all documents.")


Prepared 3504 chunks across all documents.


## Generate and Persist Embeddings

In [8]:
def embed_chunk_batch(batch: List[Dict[str, Any]]) -> List[List[float]]:
    inputs = [item["text"] for item in batch]
    response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=inputs
    )
    # OpenAI returns embeddings in the same order as inputs
    return [item.embedding for item in response.data]


if chunk_records:
    VECTOR_STORE_DIR.mkdir(parents=True, exist_ok=True)

    enriched_records: List[Dict[str, Any]] = []
    batch_size = 64
    for start in range(0, len(chunk_records), batch_size):
        batch = chunk_records[start:start + batch_size]
        embeddings = embed_chunk_batch(batch)
        for record, embedding in zip(batch, embeddings):
            enriched = {**record, "embedding": embedding}
            enriched_records.append(enriched)
        print(f"Embedded {len(enriched_records)} / {len(chunk_records)} chunks", end="\r")

    # Organize per-document and persist
    vector_store: Dict[str, Dict[str, Any]] = {}
    for record in enriched_records:
        doc_entry = vector_store.setdefault(
            record["doc_name"],
            {"embeddings": [], "chunks": []}
        )
        doc_entry["embeddings"].append(record["embedding"])
        doc_entry["chunks"].append(
            {
                "chunk_index": record["chunk_index"],
                "text": record["text"],
            }
        )

    for doc_name, doc_entry in vector_store.items():
        doc_entry["embeddings"] = np.array(doc_entry["embeddings"], dtype=np.float32)
        doc_entry["chunks"] = doc_entry["chunks"]

    with open(VECTOR_STORE_PATH, "wb") as f:
        pickle.dump(vector_store, f)

    print(f"\nPersisted vector store to {VECTOR_STORE_PATH}")
else:
    print("No chunks were prepared; vector store was not created.")


Embedded 3504 / 3504 chunks
Persisted vector store to /Users/georgiaray_ic/Documents/coding/law_comparisons/phase_2/data/vector_store/vector_store.pkl


## Query Helpers

These are also put into a utils file

In [9]:
def load_vector_store(path: Path = VECTOR_STORE_PATH) -> Dict[str, Dict[str, Any]]:
    if not path.exists():
        raise FileNotFoundError(f"Vector store not found at {path}. Run the embedding cell first.")
    with open(path, "rb") as f:
        store = pickle.load(f)
    # Ensure embeddings are numpy arrays after unpickling
    for entry in store.values():
        entry["embeddings"] = np.array(entry["embeddings"], dtype=np.float32)
    return store


def query_document(store: Dict[str, Dict[str, Any]], doc_name: str, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
    doc_entry = store.get(doc_name)
    if doc_entry is None:
        available = ", ".join(sorted(store.keys()))
        raise KeyError(f"Document '{doc_name}' not in vector store. Available names: {available}")

    query_embedding = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=[query]
    ).data[0].embedding

    doc_embeddings = doc_entry["embeddings"]
    if not len(doc_embeddings):
        return []

    query_vector = np.array(query_embedding, dtype=np.float32)
    query_norm = np.linalg.norm(query_vector)
    doc_norms = np.linalg.norm(doc_embeddings, axis=1)
    similarities = (doc_embeddings @ query_vector) / (doc_norms * query_norm + 1e-8)

    top_indices = similarities.argsort()[-top_k:][::-1]

    results: List[Dict[str, Any]] = []
    for idx in top_indices:
        chunk_meta = doc_entry["chunks"][idx]
        results.append(
            {
                "chunk_index": chunk_meta["chunk_index"],
                "similarity": float(similarities[idx]),
                "text": chunk_meta["text"],
            }
        )
    return results


In [10]:
total_chunks = sum(len(entry["chunks"]) for entry in load_vector_store().values())
print(f"Total chunks in vector store: {total_chunks}")


Total chunks in vector store: 3504


In [11]:
vector_store = load_vector_store()
chunk_counts = {doc_name: len(entry["chunks"]) for doc_name, entry in vector_store.items()}
print("Chunks per document:")
for doc_name, count in chunk_counts.items():
    print(f"  {doc_name}: {count}")


Chunks per document:
  0617QI04qmv1n2Qn: 4
  0LXiNB86ogEwokMr: 10
  0N7wzqLvYrInYD4C: 5
  0PKKl2mM9DOjt1r4: 2
  0bi9qnKBpE62Y3EJ: 1
  0uB7sxk9PSmykFPu: 4
  1NX3gRkASaf3c5mB: 33
  1clGKa1PXAUtLydt: 18
  1r3BJDFsjbXU0R3K: 23
  2XUgWVRI3YcBdecj: 1
  2qn97s0DnXYe2qH8: 143
  3AHfTSNelYuhb8eq: 180
  3D79OgkYjzMU4hqM: 4
  3Oahxp3pihpVDQVt: 44
  3yfCv6EhX8efYU8n: 7
  3zDkQRaHMFJvpzUc: 7
  4F1rb0vmsfkNshQg: 4
  5LbcbYLQjNkpiCvQ: 5
  5c1S2JtfVqit1gYP: 19
  5yexWjdtsBHm7RRA: 54
  8Ui54HFehYuBl7ur: 21
  9aurfECiBPdsexq2: 4
  9dY0wMyM1zlVcjSy: 6
  AQZ39lgN6Bw0OC7r: 5
  ByPueS8fHhoOogr3: 17
  C4tJdLHQMrOXAxJX: 2
  CC1KXU2vAqsknG1X: 4
  Cq8KBARs7aXaVrtK: 1
  DAum9XBl9BxXQdTP: 53
  Di8ayhFMxRTWRYEK: 11
  Dq7tj4kSnARqzlg5: 2
  DueoEkCNSrdjAkzn: 7
  Ei6uJoE9W1xLKVWu: 1
  EkqrNJW7QqSowArI: 1
  FFJbyT8KajMmpbwR: 5
  FfzBsvXzD9fkY2WQ: 2
  G6RmGheTDI0EhzXd: 1
  GQ7BHgpfBUyYnAjp: 3
  Gvrfapf3xwftHMMZ: 1
  Hjlnxxej10vaDRFs: 5
  HntYeSkM5RJIhfdb: 1
  I5KJZ47MUEKMk6d2: 61
  I8kbRgdLIwbUzZA5: 5
  IYZ5hi7VuVPGC1L

In [12]:
import pandas as pd
mapper = pd.read_csv("../data/unique_id_to_name.csv")
unique_id_to_name = dict(zip(mapper["unique_id"], mapper["name"]))

# Check if all the documents are represented 
for doc_name in documents.keys():
    if doc_name not in vector_store:
        readable_name = unique_id_to_name.get(doc_name, "(unknown name)")
        print(f"Document {doc_name} not represented in vector store (name: {readable_name})")


Document awpyR3n5mUyiQX8e not represented in vector store (name: net-zero challenge)


## Example Usage

In [13]:
if VECTOR_STORE_PATH.exists():
    store = load_vector_store()
    sample_doc = next(iter(store.keys()))
    sample_results = query_document(
        store,
        doc_name=sample_doc,
        query="information about market failures",
        top_k=3,
    )
    print(f"Top chunks for document: {sample_doc}")
    for result in sample_results:
        print("-" * 40)
        print(f"Chunk #{result['chunk_index']} (similarity {result['similarity']:.3f})")
        print(result["text"])
else:
    print("Vector store not found yet. Run the embedding cell above first.")


Top chunks for document: 0617QI04qmv1n2Qn
----------------------------------------
Chunk #1 (similarity 0.231)
to help Canadian onshore oil and gas companies invest in green solutions and infrastructure to continue their progress toward reducing methane emissions while facing the COVID-19 pandemic. The final intake closed on March 31, 2022, and funding eligibility closed by March 31, 2023. Results to date The ERF Onshore Program funded 24 companies, to implement 91 projects across Manitoba, Saskatchewan, Alberta and British Columbia, representing $170M in repayable and partially repayable NRCan funding. These projects are expected to cut an anticipated 4 MT of CO 2 e in their first year after completion. As a part of program administration, the ERF Onshore Program collects data from completed projects and reports on that on an ongoing basis. In addition to the data in the three tables, below, the program has compiled three separate reports. Additionality Report – evaluates the extent t