In [65]:
%pip install --upgrade pip

%pip install -q -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Extract and chunk from blog posts

In [66]:
import frontmatter, os, json

In [67]:
def extract_chunks_from_markdown(md_path, normalize_chunk_text=lambda x: x):
    """
    Extracts chunks from a Markdown file, splitting by headings and normalizing text.
    Args:
        md_path (str): Path to the Markdown file.
        normalize_chunk_text (function): Function to normalize chunk text.
    Returns:
        list: A list of dictionaries, each containing the title, heading, chunk text, tags, category, and path.
    """
    post = frontmatter.load(md_path)
    title = post.get("title", os.path.basename(md_path))
    tags = post.get("tags", [])
    category = post.get("category", "")
    lines = post.content.splitlines()

    chunks = []
    current_chunk = []
    current_heading = "Introduction"

    for line in lines:
        if line.startswith("##"):
            if current_chunk:
                chunks.append(
                    {
                        "title": title,
                        "heading": current_heading,
                        "chunk": "\n".join(current_chunk).strip(),
                        "tags": tags,
                        "category": category,
                        "path": md_path,
                    }
                )
                current_chunk = []
            current_heading = line.strip("# ").strip()
        else:
            current_chunk.append(line)

    # Final chunk
    if current_chunk:
        chunk_text = normalize_chunk_text("\n".join(current_chunk))
        chunks.append(
            {
                "title": title,
                "heading": current_heading,
                "chunk":chunk_text,
                # "chunk": "\n".join(current_chunk).strip(),
                "tags": tags,
                "category": category,
                "path": md_path,
            }
        )

    return chunks

In [68]:
import json


def get_chunk_writer(filename):
    f = open(filename, "w", encoding="utf-8")  # or "a" for append mode

    def write_chunk(chunk):
        f.write(json.dumps(chunk) + "\n")

    return write_chunk, f.close

In [69]:
import re


def normalize_chunk_text(text):
    text = text.strip()
    text = re.sub(r"\n{2,}", "\n\n", text)  # collapse 3+ newlines to 2
    text = re.sub(r"[ \t]+", " ", text)  # normalize whitespace
    return text

In [70]:
write_chunk, close_writer = get_chunk_writer("chunks_raw.jsonl")

# all_chunks = []

for file in os.listdir("../_posts"):
    if file.endswith(".md"):
        full_path = os.path.join("../_posts", file)
        print(f"Processing file: {file}")
        chunks = extract_chunks_from_markdown(full_path, normalize_chunk_text=normalize_chunk_text)
        for chunk in chunks:
            write_chunk(chunk)  # Write each chunk to the file
        # all_chunks.extend(chunks)

close_writer()

Processing file: 2024-12-12-osx-tree.md
Processing file: 2025-03-28-rules-vrs-reality.md
Processing file: 2025-03-28-assertions-vs-validation.md
Processing file: 2024-04-12-enhancing-focus-with-thematic-sprints-in-our-dynamic-development-team.md
Processing file: __2010-02-05-post-quote.md
Processing file: 2023-02-23-replacing-objectid-with-a-string.md
Processing file: 2025-04-07-rag-at-scale-is-hard-what-startups-get-wrong.md
Processing file: 2025-06-01-what-climbing-has-taught-me-about-dev-teams.md
Processing file: __2010-03-07-post-link.md
Processing file: 2025-04-23-postman-reusable-scripts.md
Processing file: __2010-02-05-post-notice.md
Processing file: 2025-04-02-lexicon.md
Processing file: 2024-05-10-npm-install.md
Processing file: 2024-01-06-azure-appinsights-with-nodejs-adding-operation-id-to-response.md
Processing file: 2025-03-12-web-hygiene.md
Processing file: 2023-10-17-postman-and-faker.md
Processing file: 2024-11-24-azure-appinsights-query-logs.md
Processing file: 2024-04