In [1]:
import os
import re
from pathlib import Path
from pprint import pprint
from typing import Any, List, TypedDict
from urllib.parse import urlparse

import requests
from dotenv import load_dotenv
from minsearch import AppendableIndex, Index
from openai import OpenAI
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.messages import FunctionToolCallEvent

In [2]:
CREDS_PATH = Path("pydantic/.env")

In [3]:
load_dotenv(CREDS_PATH)
api_key = os.getenv("OPENAI_API_KEY")

In [4]:
openai_client = OpenAI(api_key=api_key)

## Iteration 1: have agent repeatedly call 
* `search_documents`

#### Tool 1: `fetch_data`

In [5]:
#!wget https://r.jina.ai/en.wikipedia.org/wiki/Capybara -O Capybara_markdown.md

In [6]:
# with open('Capybara_markdown.md', 'r', encoding='utf-8') as f:
#   content = f.read()

In [7]:
def fetch_data(filepath: str) -> str:
    """Agent will use this tool to read data from a local file"""
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        return f"Error reading file: {e}"

In [8]:
fetched_data = fetch_data("Capybara_markdown.md")
pprint(fetched_data)

('Title: Capybara\n'
 '\n'
 'URL Source: http://en.wikipedia.org/wiki/Capybara\n'
 '\n'
 'Published Time: 2001-10-13T20:04:37Z\n'
 '\n'
 'Markdown Content:\n'
 '[![Image 1: This is a good article. Click here for more '
 'information.](http://upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/20px-Symbol_support_vote.svg.png)](http://en.wikipedia.org/wiki/Wikipedia:Good_articles* '
 '"This is a good article. Click here for more information.")\n'
 '\n'
 '[![Image 2: Page '
 'semi-protected](http://upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png)](http://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi '
 '"This article is semi-protected until March 17, 2028 at 19:23 UTC.")\n'
 '\n'
 'From Wikipedia, the free encyclopedia\n'
 '\n'
 '| Capybara |\n'
 '| --- |\n'
 '| [![Image '
 '3](http://upload.wikimedia.org/wikipedia/commons/thumb/3/34/Hydrochoeris_hydrochaeris_in_Brazil_in_Petr%C3%B3polis%2C_Rio_de_Jan

* extract only data (without also extracting additional information from individual urls)

In [9]:
def clean_markdown(content: str) -> str:
    """
    Remove obvious junk while keeping content intact.
    Then join paragraphs and remove separators.
    """
    # Remove header
    if "Markdown Content:" in content:
        content = content.split("Markdown Content:\n", 1)[1]

    # Remove references section
    if "\nReferences\n" in content:
        content = content.split("\nReferences\n")[0]
    if "\nExternal links\n" in content:
        content = content.split("\nExternal links\n")[0]

    # Extract text from links - keep the text, lose the URL
    content = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", content)

    # Remove images
    content = re.sub(r"!\[[^\]]*\]\([^)]+\)", "", content)

    # Remove citation markers like [[1]] or [[a]]
    content = re.sub(r"\[\[[^\]]+\]\]", "", content)

    # Remove URLs but ONLY if they're standalone
    content = re.sub(r"https?://[^\s]+", "", content)

    # Remove section separators like '---------\n' or '-----\n'
    content = re.sub(r"-{5,}\n", "\n", content)

    # Replace newlines with spaces to join text together
    content = re.sub(r"\n+", " ", content)

    # Clean up noise artifacts
    content = re.sub(r"\s*\(\s*\(\s*", " ", content)  # Remove (( with spaces
    content = re.sub(r'\s*\(".*?"\)', "", content)  # Remove ("text")
    content = re.sub(r"\.jpg\)", "", content)  # Remove .jpg) files
    content = re.sub(
        r"(_on_|_alpha_)\w+\.(JPG|jpg)\)", "", content
    )  # Remove image files

    # Clean up extra spaces
    content = re.sub(r" {2,}", " ", content).strip()

    return content

In [None]:
def chunk_text(
    text: str, chunk_size: int = 1000, chunk_overlap: int = 200
) -> list[dict]:
    """
    Split text into overlapping chunks and return as list of dicts for indexing.

    Args:
        text: The text to chunk
        chunk_size: Characters per chunk
        chunk_overlap: Overlap between chunks

    Returns:
        List of dicts with 'content' field for indexing
    """
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunk_text = text[start:end]

        chunks.append(
            {"content": chunk_text, "start": start, "end": min(end, len(text))}
        )

        if end >= len(text):
            break

        start += chunk_size - chunk_overlap  # Move forward with overlap

    return chunks


# Clean and chunk
cleaned_text = clean_markdown(fetched_data)
document_chunks = chunk_text(cleaned_text, chunk_size=1000, chunk_overlap=200)

print(f"Created {len(document_chunks)} chunks")
print("\nFirst chunk preview (first 200 chars):")
pprint(document_chunks[0]["content"][:200])

Created 18 chunks

First chunk preview (first 200 chars):
('From Wikipedia, the free encyclopedia | Capybara | | --- | | | | In '
 'Petrópolis, Brazil | | Conservation status | | !Image 4 Least Concern(IUCN '
 '3.1)( | | Scientific classification "Taxonomy (biology)")')


In [None]:
index = Index(
    text_fields=["content"]  # Only one field since we're indexing chunks
)

index.fit(document_chunks)

print("✅ Index created and fitted with chunks!")

✅ Index created and fitted with chunks!


In [12]:
# Test the index with a search
test_results = index.search("capybara diet", num_results=3)

print(f"Found {len(test_results)} results for 'capybara diet'\n")
for i, result in enumerate(test_results, 1):
    print(
        f"--- Result {i} (chars {result.get('start', '?')}-{result.get('end', '?')}) ---"
    )
    print(result.get("content", "")[:300] + "...\n")

Found 3 results for 'capybara diet'

--- Result 1 (chars 6400-7400) ---
 back-and-forth rather than side-to-side.( Capybaras are autocoprophagous,( meaning they eat their own feces as a source of bacterial gut flora, to help digest the cellulose in the grass that forms their normal diet, and to extract the maximum protein and vitamins from their food. They also regurgit...

--- Result 2 (chars 4800-5800) ---
 along rivers in the tropical rainforest. They are superb swimmers and can hold their breath underwater for up to five minutes at a time. Capybara have flourished in cattle ranches. They roam in home ranges averaging 10 hectares (25 acres) in high-density populations.( Many escapees from captivity c...

--- Result 3 (chars 5600-6600) ---
n North America may actually fill the ecological niche of the Pleistocene species.( ### Diet and predation A capybara eating hay at Franklin Park Zoo, Boston, Massachusetts Capybaras are herbivores, grazing mainly on grasses and aquatic plants,( as

👇🏻 refactor to a class

In [13]:
class SearchResult(TypedDict):
    """Represents a single search result entry for Capybara article."""

    start: int
    content: str
    end: int


def search_documents(query: str) -> List[SearchResult]:
    """
    Search the index for document chunks matching the given query.

    Args:
        query (str): The search query string (e.g., "capybara habitat", "diet")

    Returns:
        List[SearchResult]: A list of search results with start, content, and end positions
    """
    return index.search(
        query=query,
        num_results=5,
    )

## Instantiate Agent
* calls `search_documents`

#### system prompt

In [14]:
instructions = """
You are a summary agent for Wikipedia articles. Your goal is to create a comprehensive, well-structured summary of the Capybara article using the indexed document chunks. You will retrieve relevant information through targeted searches.

Summary creation process:

Phase 1 - Exploration (3-5 broad searches):
   - Start with a broad query about capybara to understand the overall topic
   - Identify major themes and sections (e.g., taxonomy, habitat, behavior, diet, reproduction, conservation)
   - Use general searches to map out the content structure
   
Phase 2 - Deep retrieval (8-12 specific searches):
   - For each major topic identified in phase 1, perform 1-2 focused searches with specific queries
   - Examples: "capybara habitat South America", "capybara diet grasses", "capybara reproduction mating"
   - Retrieve chunks covering different aspects of each topic
   - Total goal: ~11-17 searches overall
   
Phase 3 - Synthesis:
   - Combine information from multiple search results
   - Organize into logical sections
   - Create a coherent, informative summary

Final deliverable:

Produce a structured summary as valid JSON following the Article model schema.
The summary must be comprehensive, accurate, and well-organized.

Rules:

Search queries:
   - Use specific, targeted queries (e.g., "capybara habitat", "capybara diet", "capybara behavior")
   - Focus on retrieving chunks that cover different aspects of the article
   - Each search should help build a comprehensive understanding

Summary structure:
   - Structure: Create an Article object with introduction, sections, and conclusion
   - Introduction: A single string containing 2-3 paragraphs (4-6 sentences each) providing context
   - Main sections (8-12 topics): taxonomy, physical description, habitat, behavior, diet, reproduction, conservation, etc.
     - Each section has: title (string) and paragraphs (list of 2-4 paragraph strings)
     - Each paragraph should be 4-6 sentences
   - Conclusion: A single string containing 2-3 paragraphs synthesizing key points

Content quality:
   - All information must be based on retrieved chunks from the indexed document
   - Reference the source chunks that support each claim
   - Do not fabricate information
   - Maintain accuracy and coherence

Structure and flow:
   - Sections should progress logically (intro → taxonomy → description → behavior → etc.)
   - Use clear transitions between sections
   - Maintain a natural narrative flow

Output format:
   - Produce valid JSON following the Article model schema
   - Structure: Article(introduction=str, sections=[ArticleSection(title, paragraphs)], conclusion=str)
   - Each section's paragraphs should be a list of strings, not a single multi-paragraph string
   - Do not include markdown formatting
   - Organize sections clearly with unique titles

Quality checks:
   - Ensure comprehensive coverage of main topics
   - Verify all paragraphs are informative and relevant
   - Confirm proper section organization
   - Ensure conclusion effectively synthesizes key points

Only output once all checks are satisfied.
""".strip()

#### event handler as callback

In [15]:
async def print_function_calls(ctx, event):
    # Detect nested streams
    if hasattr(event, "__aiter__"):
        async for sub in event:
            await print_function_calls(ctx, sub)
        return

    if isinstance(event, FunctionToolCallEvent):  # this is the main part
        print("TOOL CALL:", event.part.tool_name, event.part.args)

#### enforce structured output

In [16]:
class ArticleSection(BaseModel):
    """A section of the article with title and paragraphs"""

    title: str = Field(
        description="Title of the section (e.g., 'Taxonomy', 'Habitat', 'Diet')"
    )
    paragraphs: List[str] = Field(
        description="List of paragraphs in this section (4-6 sentences each)"
    )


class Article(BaseModel):
    """Complete article structure for the summary"""

    introduction: str = Field(
        description="Introduction paragraph(s) providing context (2-3 paragraphs)"
    )
    sections: List[ArticleSection] = Field(
        description="Main sections of the article (8-12 sections)"
    )
    conclusion: str = Field(
        description="Conclusion paragraph(s) synthesizing key points (2-3 paragraphs)"
    )

In [17]:
agent = Agent(
    name="summarizer",
    instructions=instructions,
    tools=[search_documents],
    model="gpt-4o-mini",
    output_type=Article,
)

In [18]:
results = await agent.run(
    "What is this page about? https://en.wikipedia.org/wiki/Capybara",
    event_stream_handler=print_function_calls,
)

TOOL CALL: search_documents {"query":"capybara overview"}
TOOL CALL: search_documents {"query": "capybara taxonomy classification"}
TOOL CALL: search_documents {"query": "capybara physical description"}
TOOL CALL: search_documents {"query": "capybara habitat and distribution"}
TOOL CALL: search_documents {"query": "capybara social behavior"}
TOOL CALL: search_documents {"query": "capybara diet feeding habits"}
TOOL CALL: search_documents {"query": "capybara reproduction mating"}
TOOL CALL: search_documents {"query": "capybara conservation status threats"}
TOOL CALL: search_documents {"query": "capybara interactions with humans"}
TOOL CALL: search_documents {"query": "capybara cultural significance and media"}
TOOL CALL: search_documents {"query": "capybara genetics and evolution"}


In [19]:
print(results.output)

introduction='The capybara (_Hydrochoerus hydrochaeris_) is the largest living rodent, native to South America, and belongs to the genus _Hydrochoerus_. These semi-aquatic mammals are closely related to guinea pigs and rock cavies, and they share a broader connection with species like the agouti and the nutria. Recognized for their sociable nature, capybaras typically inhabit savannas and dense forests, often in proximity to water bodies, where they are commonly found in groups that can range in size from a few individuals to larger gatherings of up to a hundred. Popularly referred to as the "giant capybara," they play significant ecological roles in their habitats and are also of interest in human interaction and culture.' sections=[ArticleSection(title='Taxonomy and Classification', paragraphs=['Capybaras belong to the kingdom Animalia, phylum Chordata, class Mammalia, order Rodentia, family Caviidae, and the genus _Hydrochoerus_. There are only two recognized living species in the g

### Conclusion
* introduction: single string with multiple paragraphs ✅
* sections: List[ArticleSection] where each has:
    * title: string ✅
    * paragraphs: list of paragraph strings ✅
    * conclusion: single string with multiple paragraphs ✅

# Iteration 2: Multi-Article Support with Agent-Controlled Tools

In this iteration, we convert the system to allow the agent to dynamically fetch, clean, chunk, and index multiple articles.

## Key Changes:
- Convert `Index` → `AppendableIndex` for incremental indexing
- Make `fetch_data` an agent tool
- Add `clean_and_chunk_data` tool
- Add `index_data` tool  
- Agent controls full workflow (fetch → clean → chunk → index → search)

Let's get started!


#### Reuse Existing Functions
- `fetch_data()` - already works with URLs
- `clean_markdown()` - already implemented
- `chunk_text()` - already implemented


In [20]:
URLS = [
    "https://en.wikipedia.org/wiki/Lesser_capybara",
    "https://en.wikipedia.org/wiki/Hydrochoerus",
    "https://en.wikipedia.org/wiki/Neochoerus",
    "https://en.wikipedia.org/wiki/Caviodon",
    "https://en.wikipedia.org/wiki/Neochoerus_aesopi",
]

In [None]:
def _fetch_single_url(url: str) -> dict[str, str]:
    """
    Helper function to fetch a single Wikipedia article as markdown and save locally.

    Args:
        url: URL to fetch (automatically converts Wikipedia URLs to Jina AI format)

    Returns:
        Dict with single key-value pair: {filename: content}
        - filename: The local .md filename where content was saved (e.g., "Capybara.md")
        - content: The raw markdown content of the article

    Example:
        >>> result = _fetch_single_url("https://en.wikipedia.org/wiki/Capybara")
        >>> # Saves to "Capybara.md" and returns {"Capybara.md": "Title: Capybara\\n..."}
    """
    try:
        if "en.wikipedia.org" in url and "r.jina.ai" not in url:
            jina_url = url.replace("en.wikipedia.org", "r.jina.ai/en.wikipedia.org")
        else:
            jina_url = url

        path = urlparse(jina_url).path
        filename = path.split("/")[-1] + ".md"
        if not filename or filename == ".md":
            filename = "article.md"

        response = requests.get(jina_url, timeout=30)
        response.raise_for_status()
        content = response.text

        with open(filename, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"✅ Fetched and saved: {filename}")
        return {filename: content}
    except Exception as e:
        print(f"❌ Error fetching {url}: {e}")
        return {url: f"Error: {e}"}

In [22]:
# test_fetch_one = _fetch_single_url("https://en.wikipedia.org/wiki/Capybara")
# test_fetch_one

In [23]:
def fetch_multiple_articles(urls_list: list[str]) -> dict[str, str]:
    """
    Fetch multiple Wikipedia articles as markdown files and save them locally.

    This function is designed to download and save multiple articles at once,
    making them available for the agent to process and index.

    Args:
        urls_list: List of Wikipedia article URLs to fetch

    Returns:
        Dict mapping filename to content: {filename: content}
        - Keys are .md filenames (e.g., "Capybara.md", "Penguin.md")
        - Values are the raw markdown content of each article

    Example:
        >>> urls = [
        ...     "https://en.wikipedia.org/wiki/Capybara",
        ...     "https://en.wikipedia.org/wiki/Penguin"
        ... ]
        >>> results = fetch_multiple_articles(urls)
        >>> # Returns: {"Capybara.md": "Title: Capybara...", "Penguin.md": "Title: Penguin..."}
        >>> # Files saved: Capybara.md, Penguin.md

    Note:
        This function is typically called before indexing articles for search.
        The agent uses this to fetch articles that it will later process with
        clean_and_chunk_data() and index_data().
    """
    results: dict[str, str] = {}
    for url in urls_list:
        results.update(_fetch_single_url(url))

    return results

In [24]:
# test_fetch_five = fetch_multiple_articles(URLS)
# test_fetch_five

In [None]:
def clean_and_chunk_data(content: str) -> dict:
    """
    Clean markdown content and chunk it for indexing.

    Args:
        content: Raw markdown content from fetch_data
    Returns:
        Dictionary with:
        - 'cleaned_text': The cleaned text
        - 'chunks': List of chunk dicts with content, start, end
        - 'chunk_count': Number of chunks created
    """
    cleaned_text = clean_markdown(content)

    chunks = chunk_text(cleaned_text, chunk_size=1000, chunk_overlap=200)

    return {"cleaned_text": cleaned_text, "chunks": chunks, "chunk_count": len(chunks)}

In [26]:
index_v2 = AppendableIndex(text_fields=["content"])


def index_data(chunks: list) -> str:
    """
    Add chunks to the search index.

    Call this after clean_and_chunk_data to make content searchable.

    Args:
        chunks: List of chunk dictionaries from clean_and_chunk_data
    Returns:
        Confirmation message with number of chunks indexed
    """
    index_v2.add(chunks)
    return f"✅ Indexed {len(chunks)} chunks"

In [27]:
def search_documents_v2(query: str) -> List[SearchResult]:
    """
    Search the index for document chunks matching the given query.

    Args:
        query (str): The search query string (e.g., "capybara habitat", "diet")

    Returns:
        List[SearchResult]: A list of search results with start, content, and end positions
    """
    return index_v2.search(
        query=query,
        num_results=5,
    )

## Instantiate Agent for Iteration 2


In [28]:
instructions = """
You are a summary agent for Wikipedia articles. Your goal is to create a comprehensive, well-structured summary of the Capybara article using the indexed document chunks. You will retrieve relevant information through targeted searches.

Summary creation process:

Phase 1 - Exploration (3-5 broad searches):
   - Start with a broad query about capybara to understand the overall topic
   - Identify major themes and sections (e.g., taxonomy, habitat, behavior, diet, reproduction, conservation)
   - Use general searches to map out the content structure
   
Phase 2 - Deep retrieval (8-12 specific searches):
   - For each major topic identified in phase 1, perform 1-2 focused searches with specific queries
   - Examples: "capybara habitat South America", "capybara diet grasses", "capybara reproduction mating"
   - Retrieve chunks covering different aspects of each topic
   - Total goal: ~11-17 searches overall
   
Phase 3 - Synthesis:
   - Combine information from multiple search results
   - Organize into logical sections
   - Create a coherent, informative summary

Final deliverable:

Produce a structured summary as valid JSON following the Article model schema.
The summary must be comprehensive, accurate, and well-organized.

Rules:

Search queries:
   - Use specific, targeted queries (e.g., "capybara habitat", "capybara diet", "capybara behavior")
   - Focus on retrieving chunks that cover different aspects of the article
   - Each search should help build a comprehensive understanding

Summary structure:
   - Structure: Create an Article object with introduction, sections, and conclusion
   - Introduction: A single string containing 2-3 paragraphs (4-6 sentences each) providing context
   - Main sections (8-12 topics): taxonomy, physical description, habitat, behavior, diet, reproduction, conservation, etc.
     - Each section has: title (string) and paragraphs (list of 2-4 paragraph strings)
     - Each paragraph should be 4-6 sentences
   - Conclusion: A single string containing 2-3 paragraphs synthesizing key points

Content quality:
   - All information must be based on retrieved chunks from the indexed document
   - Reference the source chunks that support each claim
   - Do not fabricate information
   - Maintain accuracy and coherence

Structure and flow:
   - Sections should progress logically (intro → taxonomy → description → behavior → etc.)
   - Use clear transitions between sections
   - Maintain a natural narrative flow

Output format:
   - Produce valid JSON following the Article model schema
   - Structure: Article(introduction=str, sections=[ArticleSection(title, paragraphs)], conclusion=str)
   - Each section's paragraphs should be a list of strings, not a single multi-paragraph string
   - Do not include markdown formatting
   - Organize sections clearly with unique titles

Quality checks:
   - Ensure comprehensive coverage of main topics
   - Verify all paragraphs are informative and relevant
   - Confirm proper section organization
   - Ensure conclusion effectively synthesizes key points

**Critical Rule**: Before generating or finalizing the summary, you must have called :
1. `fetch_multiple_articles` - to download all articles
2. `clean_and_chunk_data` - to process **each article**
3. `index_data` - to **add each article to the search index**
4. `search_documents_v2` - to retrieve information from the indexed articles

Do not attempt to generate the summary without first completing this workflow. 
Verify you have performed multiple searches across the indexed content before synthesizing your response.

Only output once all checks are satisfied.
""".strip()

In [None]:
agent_v2 = Agent(
    name="multi_article_summarizer",
    instructions=instructions,
    tools=[fetch_data, clean_and_chunk_data, index_data, search_documents_v2],
    model="gpt-4o-mini",
    output_type=Article,
)

In [None]:
# Event handler for iteration 2
async def print_function_calls_v2(ctx: Any, event: Any) -> None:
    """Print function calls for debugging."""
    if hasattr(event, "__aiter__"):
        async for sub in event:
            await print_function_calls_v2(ctx, sub)
        return

    if isinstance(event, FunctionToolCallEvent):
        print(f"🔧 Tool: {event.part.tool_name}")
        print(f"   Args: {event.part.args}")
        print()

In [31]:
results = await agent.run(
    "What are threats to capybara populations?",
    event_stream_handler=print_function_calls_v2,
)

🔧 Tool: search_documents
   Args: {"query":"capybara threats to population"}

🔧 Tool: search_documents
   Args: {"query":"capybara conservation threats"}

🔧 Tool: search_documents
   Args: {"query":"capybara population decline causes"}

🔧 Tool: search_documents
   Args: {"query":"capybara population decline hunting habitat loss"}

🔧 Tool: search_documents
   Args: {"query":"capybara population decline habitat destruction"}

🔧 Tool: search_documents
   Args: {"query":"capybara hunting and habitat loss effects"}

🔧 Tool: search_documents
   Args: {"query":"capybara conservation hunting habitat destruction"}

🔧 Tool: search_documents
   Args: {"query":"capybara population decline hunting effects habitat destruction"}

🔧 Tool: search_documents
   Args: {"query":"capybara hunting conservation habitat loss impact"}

🔧 Tool: search_documents
   Args: {"query":"capybara threats hunting habitat destruction loss"}

🔧 Tool: search_documents
   Args: {"query":"capybara threats hunting habitat dest

In [32]:
pprint(results.output)

Article(introduction="The capybara (_Hydrochoerus hydrochaeris_) is the world's largest rodent, native to South America. Known for its social nature, the capybara inhabits a variety of ecosystems including savannas, forests, and wetlands, often found in groups near water sources. While not currently classified as threatened, various pressures exist that affect their populations, including hunting and habitat destruction.", sections=[ArticleSection(title='Taxonomy', paragraphs=['The capybara belongs to the family Caviidae, which includes other rodents such as guinea pigs. It is a member of the genus _Hydrochoerus_ and its species name is _H. hydrochaeris_. Capybaras are closely related to rock cavies and more distantly related to chinchillas and agoutis, illustrating their position in the rodent evolutionary tree.', "The lineage of capybaras extends back to the Neogene period, with several fossil species discovered that indicate their long evolutionary history. The classification of ext

### Conclusions
* Strengths:
 - 10 sections with diverse topics
 - Accurate details (coprophagy, group sizes, wallowing)
 - Clear structure and flow
- Covers requested topics (e.g., threats)

* Issues:
- Introduction too short
- Should be 2–3 paragraphs (4–6 sentences each)
- Currently 1 paragraph (~3 sentences)
- Paragraphs too short in places
- Some are 1–2 sentences; target 4–6
- "Conclusion" as a section
    - Move this content to the conclusion field; sections should cover article topics, not conclude
- Query vs output
    - Query focused on threats, but output is a full article summary
- Option: focus the sections on threats, or add an explicit “threats” section and adjust the prompt
    - Suggestions:
    * Expand the introduction to 2–3 paragraphs
    * Lengthen paragraphs to 4–6 sentences
    * Move conclusion content to the conclusion field
    * Decide whether to constrain the output to the query or adjust the prompt to reflect full-article scope
* Overall, the structure and content are strong; the main gaps are paragraph length and placement of concluding material.

## Validate model's reasoning

In [None]:
def verify_chunk_content(chunk: dict, original_file: str) -> bool:
    """
    Verify that chunk content actually exists in the original markdown file.

    Args:
        chunk: Chunk dict with start, end, content, and metadata
        original_file: Path to the .md file to check

    Returns:
        True if content matches, False otherwise
    """
    # Read the original file
    with open(original_file, "r", encoding="utf-8") as f:
        full_content = f.read()

    # Clean the content to match what was chunked
    cleaned_full = clean_markdown(full_content)

    # Extract the text at the claimed positions
    start = chunk["start"]
    end = chunk["end"]
    actual_snippet = cleaned_full[start:end]

    # Check if it matches
    return actual_snippet.strip() == chunk["content"].strip()


# Usage example:
chunk = {
    "content": "The capybara...",
    "start": 100,
    "end": 1000,
    "chunk_id": "Capybara_100_1000",
}

is_valid = verify_chunk_content(chunk, "Capybara.md")
print(f"Chunk verified: {is_valid}")

## For better code maintainability (pending)

In [None]:
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    article_title: str = None,
    source_url: str = None,
    published_time: str = None,
) -> list[dict]:
    """
    Split text into overlapping chunks with metadata for QA.

    Args:
        text: The text to chunk
        chunk_size: Characters per chunk
        chunk_overlap: Overlap between chunks
        article_title: Title of the article (for QA)
        source_url: Original URL of the article (for QA)
        published_time: Publication timestamp (for QA)

    Returns:
        List of dicts with content, position, and metadata fields
    """
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunk_text = text[start:end]

        # Create unique chunk ID
        chunk_id = f"{article_title or 'article'}_{start}_{end}"

        chunk = {
            "content": chunk_text,
            "start": start,
            "end": min(end, len(text)),
            "chunk_id": chunk_id,
        }

        # Add metadata if provided
        if article_title:
            chunk["article_title"] = article_title
        if source_url:
            chunk["source_url"] = source_url
        if published_time:
            chunk["published_time"] = published_time

        chunks.append(chunk)

        if end >= len(text):
            break

        start += chunk_size - chunk_overlap

    return chunks

In [None]:
def clean_and_chunk_data(content: str, article_metadata: dict = None) -> dict:
    """
    Clean markdown content and chunk it for indexing with metadata.

    Args:
        content: Raw markdown content from fetch_data
        article_metadata: Dict with article info (title, url, time) for QA
            Example: {"title": "Capybara", "url": "http://en.wikipedia.org/...",
                     "published_time": "2001-10-13T20:04:37Z"}

    Returns:
        Dictionary with:
        - 'cleaned_text': The cleaned text
        - 'chunks': List of chunk dicts with content, position, and metadata
        - 'chunk_count': Number of chunks created
    """
    # Clean the content
    cleaned_text = clean_markdown(content)

    # Extract metadata if available
    metadata = {}
    if article_metadata:
        metadata = {
            "article_title": article_metadata.get("title"),
            "source_url": article_metadata.get("url"),
            "published_time": article_metadata.get("published_time"),
        }

    # Chunk the text with metadata
    chunks = chunk_text(cleaned_text, chunk_size=1000, chunk_overlap=200, **metadata)

    return {"cleaned_text": cleaned_text, "chunks": chunks, "chunk_count": len(chunks)}