In [1]:
import os

# Process the data files
1. If you haven't git cloned the files, do so now.
2. We will only grab the markdown files in this simple example.
3. Insert the processed data into the database.
4. Create a vectorizer to process the data into embeddings. This is all handled by the pgai system.
5. Profit. Query the database in SQL.
6. Brief Python example.


### Let's grab all the markdown files in the directory and return a list.

In [2]:
def find_markdown_files(directory):
    """Recursively collect the paths of all ``.md`` files under ``directory``.

    Args:
        directory: Root folder to walk.

    Returns:
        A list of paths, each built by joining the file name onto the folder
        it was found in, in the order ``os.walk`` yields them. The suffix
        check is case-sensitive.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.md')
    ]

#### Let's quickly print the files to make sure we are good.

In [13]:
# Example usage: scan the 'pydantic-ai' directory (a relative path, so it is
# resolved against the current working directory) and print the raw list
directory_path = 'pydantic-ai'
markdown_files = find_markdown_files(directory_path)
print(markdown_files)


['pydantic-ai/README.md', 'pydantic-ai/pydantic_ai_slim/README.md', 'pydantic-ai/tests/example_modules/README.md', 'pydantic-ai/docs/troubleshooting.md', 'pydantic-ai/docs/dependencies.md', 'pydantic-ai/docs/install.md', 'pydantic-ai/docs/help.md', 'pydantic-ai/docs/message-history.md', 'pydantic-ai/docs/testing-evals.md', 'pydantic-ai/docs/multi-agent-applications.md', 'pydantic-ai/docs/results.md', 'pydantic-ai/docs/index.md', 'pydantic-ai/docs/models.md', 'pydantic-ai/docs/contributing.md', 'pydantic-ai/docs/agents.md', 'pydantic-ai/docs/logfire.md', 'pydantic-ai/docs/graph.md', 'pydantic-ai/docs/tools.md', 'pydantic-ai/docs/examples/rag.md', 'pydantic-ai/docs/examples/bank-support.md', 'pydantic-ai/docs/examples/flight-booking.md', 'pydantic-ai/docs/examples/stream-whales.md', 'pydantic-ai/docs/examples/sql-gen.md', 'pydantic-ai/docs/examples/pydantic-model.md', 'pydantic-ai/docs/examples/chat-app.md', 'pydantic-ai/docs/examples/question-graph.md', 'pydantic-ai/docs/examples/index.

In [14]:
# Pretty print the markdown file names, one path per line
for file in markdown_files:
    print(file)

pydantic-ai/README.md
pydantic-ai/pydantic_ai_slim/README.md
pydantic-ai/tests/example_modules/README.md
pydantic-ai/docs/troubleshooting.md
pydantic-ai/docs/dependencies.md
pydantic-ai/docs/install.md
pydantic-ai/docs/help.md
pydantic-ai/docs/message-history.md
pydantic-ai/docs/testing-evals.md
pydantic-ai/docs/multi-agent-applications.md
pydantic-ai/docs/results.md
pydantic-ai/docs/index.md
pydantic-ai/docs/models.md
pydantic-ai/docs/contributing.md
pydantic-ai/docs/agents.md
pydantic-ai/docs/logfire.md
pydantic-ai/docs/graph.md
pydantic-ai/docs/tools.md
pydantic-ai/docs/examples/rag.md
pydantic-ai/docs/examples/bank-support.md
pydantic-ai/docs/examples/flight-booking.md
pydantic-ai/docs/examples/stream-whales.md
pydantic-ai/docs/examples/sql-gen.md
pydantic-ai/docs/examples/pydantic-model.md
pydantic-ai/docs/examples/chat-app.md
pydantic-ai/docs/examples/question-graph.md
pydantic-ai/docs/examples/index.md
pydantic-ai/docs/examples/stream-markdown.md
pydantic-ai/docs/examples/weathe

### Let's open up one of the files
- As we can see, we successfully grabbed the pydantic-ai documentation.


In [15]:
# Print the contents of the first file. The encoding is passed explicitly
# (as chunk_markdown_file does) so that reading the docs does not depend on
# the platform's default text encoding.
with open(markdown_files[0], 'r', encoding='utf-8') as file:
    print(file.read())

<div align="center">
  <a href="https://ai.pydantic.dev/">
    <picture>
      <source media="(prefers-color-scheme: dark)" srcset="https://ai.pydantic.dev/img/pydantic-ai-dark.svg">
      <img src="https://ai.pydantic.dev/img/pydantic-ai-light.svg" alt="PydanticAI">
    </picture>
  </a>
</div>
<div align="center">
  <em>Agent Framework / shim to use Pydantic with LLMs</em>
</div>
<div align="center">
  <a href="https://github.com/pydantic/pydantic-ai/actions/workflows/ci.yml?query=branch%3Amain"><img src="https://github.com/pydantic/pydantic-ai/actions/workflows/ci.yml/badge.svg?event=push" alt="CI"></a>
  <a href="https://coverage-badge.samuelcolvin.workers.dev/redirect/pydantic/pydantic-ai"><img src="https://coverage-badge.samuelcolvin.workers.dev/pydantic/pydantic-ai.svg" alt="Coverage"></a>
  <a href="https://pypi.python.org/pypi/pydantic-ai"><img src="https://img.shields.io/pypi/v/pydantic-ai.svg" alt="PyPI"></a>
  <a href="https://github.com/pydantic/pydantic-ai"><img src="http

# Next up: getting the Markdown files into the database
This solution will:
- Split each markdown file into chunks based on headers
- Preserve metadata about the source file and section
- Create a table with the necessary columns
- Set up a vectorizer to automatically embed the content
- Insert all the chunks into the database

The vectorizer will automatically process the chunks and create embeddings that you can use for semantic search.

Remember to adjust the database connection parameters according to your setup. Also, make sure you have the pgai extension and Ollama running as described in your vectorizer quick start guide.

In [16]:
import re
from typing import List, Dict

def _split_markdown_sections(content: str) -> List[str]:
    """Split markdown text in front of every level-1/level-2 header line.

    Header-looking lines inside fenced code blocks (``` or ~~~) are ignored,
    so a Python comment such as ``# setup`` in a code sample does not start a
    new section.
    """
    header_start = re.compile(r'#{1,2}\s')
    fence_start = re.compile(r'[ \t]*(`{3,}|~{3,})')

    sections: List[str] = []
    current: List[str] = []
    open_fence = None  # marker (e.g. "```") of the fence we are inside, if any

    lines = content.split('\n')
    last_index = len(lines) - 1
    for index, text in enumerate(lines):
        # Re-attach the newline that split() removed (the final piece never had one).
        line = text if index == last_index else text + '\n'
        fence = fence_start.match(line)
        if open_fence is None:
            # A backtick fence may not have backticks in its info string;
            # a line like ```inline``` is inline code, not a fence opener.
            if fence and not (fence.group(1)[0] == '`' and '`' in line[fence.end():]):
                open_fence = fence.group(1)
            elif header_start.match(line) and current:
                sections.append(''.join(current))
                current = []
        elif (
            fence
            and fence.group(1)[0] == open_fence[0]
            and len(fence.group(1)) >= len(open_fence)
            and not line[fence.end():].strip()
        ):
            # Closing fence: same character, at least as long, nothing after it.
            open_fence = None
        current.append(line)
    sections.append(''.join(current))
    return sections


def chunk_markdown_file(file_path: str) -> List[Dict]:
    """Read a markdown file and split it into one chunk per ``#``/``##`` section.

    Args:
        file_path: Path of the markdown file; it is read as UTF-8.

    Returns:
        A list of dicts, one per non-empty section, with the keys
        ``source_file`` (the given path), ``title`` (the header text, or
        ``"No Header"`` when the section does not start with a ``#``/``##``
        header), ``content`` (the stripped section text, header included) and
        ``metadata`` (``file_path`` and ``section_title``).

    Headers of level three and deeper stay inside their parent section, and
    ``#`` lines inside fenced code blocks are not treated as headers.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    processed_chunks = []
    for chunk in _split_markdown_sections(content):
        if chunk.strip():  # Skip empty chunks
            # The title is taken from the first line only when it is a #/## header
            header_match = re.match(r'^#{1,2}\s+(.+)$', chunk.split('\n')[0])
            header = header_match.group(1) if header_match else "No Header"

            processed_chunks.append({
                'source_file': file_path,
                'title': header,
                'content': chunk.strip(),
                'metadata': {
                    'file_path': file_path,
                    'section_title': header
                }
            })

    return processed_chunks



In [17]:
# Test the chunker on a single markdown file (index 10 of markdown_files,
# not the first entry)
test_file = markdown_files[10]
print(f"Testing file: {test_file}\n")

# Get chunks for the test file
test_chunks = chunk_markdown_file(test_file)

# Print each chunk in a readable format; numbering starts at 1 and the
# content is cut to its first 768 characters for the preview
for i, chunk in enumerate(test_chunks, 1):
    print(f"\n=== Chunk {i} ===")
    print(f"Title: {chunk['title']}")
    print(f"Source: {chunk['source_file']}")
    print(f"\nContent Preview (first 768 chars):")
    print(chunk['content'][:768], "...\n")
    print("Metadata:", chunk['metadata'])
    print("=" * 50)

Testing file: pydantic-ai/docs/results.md


=== Chunk 1 ===
Title: No Header
Source: pydantic-ai/docs/results.md

Content Preview (first 768 chars):
Results are the final values returned from [running an agent](agents.md#running-agents).
The result values are wrapped in [`RunResult`][pydantic_ai.result.RunResult] and [`StreamedRunResult`][pydantic_ai.result.StreamedRunResult] so you can access other data like [usage][pydantic_ai.usage.Usage] of the run and [message history](message-history.md#accessing-messages-from-results)

Both `RunResult` and `StreamedRunResult` are generic in the data they wrap, so typing information about the data returned by the agent is preserved.

```python {title="olympics.py"}
from pydantic import BaseModel

from pydantic_ai import Agent


class CityLocation(BaseModel):
    city: str
    country: str


agent = Agent('gemini-1.5-flash', result_type=CityLocation)
result = agent. ...

Metadata: {'file_path': 'pydantic-ai/docs/results.md', 'section_title': 'No H

### That's good enough for this example DB. Let's process and chunk up all of the files and then put them into the database table that we created earlier.

In [18]:
# Process all markdown files: chunk each one and gather every chunk into a
# single flat list (the list is reset first, so re-running the cell does not
# accumulate duplicates)
all_chunks = []
for file_path in markdown_files:
    chunks = chunk_markdown_file(file_path)
    all_chunks.extend(chunks)

In [None]:
# Show a short summary instead of dumping every chunk: all_chunks holds the
# full text of every section of every file, which floods the cell output.
print(f"Total chunks: {len(all_chunks)}")
for chunk in all_chunks[:5]:
    print(f"- {chunk['source_file']} :: {chunk['title']}")

### We will insert this into the database using psycopg2, but you can run a normal SQL statement in the terminal as well.

In [20]:
import psycopg2
import json
from psycopg2.extras import execute_values

# Database connection parameters
db_params = {
    'dbname': 'postgres',
    'user': 'postgres',
    'password': 'postgres',
    'host': 'localhost',
    'port': '5432'
}

# Prepare the data for insertion before touching the database; the metadata
# dict is serialised to a JSON string for its column
values = [
    (
        chunk['source_file'],
        chunk['title'],
        chunk['content'],
        json.dumps(chunk['metadata'])
    )
    for chunk in all_chunks
]

# Connect to the database
conn = psycopg2.connect(**db_params)
try:
    # `with conn` commits when the block succeeds and rolls back if it raises;
    # `with conn.cursor()` closes the cursor either way.
    with conn, conn.cursor() as cur:
        # Insert the data.
        # NOTE(review): plain INSERT - re-running this cell presumably adds the
        # same rows again unless the table has a unique constraint.
        execute_values(
            cur,
            """
    INSERT INTO documentation (source_file, title, content, metadata)
    VALUES %s
    """,
            values
        )
finally:
    # `with conn` does not close the connection, so always do it here
    conn.close()

### Let's check the database to make sure our data was added.
- We can improve the chunking and the storage.

In [None]:
# ## Why use PydanticAI

# * __Built by the Pydantic Team__
# Built by the team behind [Pydantic](https://docs.pydantic.dev/latest/) (the validation layer of the OpenAI SDK, the Anthropic SDK, LangChain, LlamaIndex, AutoGPT, Transformers, CrewAI, Instructor and many more).

# * __Model-agnostic__
# Supports OpenAI, Anthropic, Gemini, Ollama, Groq, and Mistral, and there is a simple interface to implement support for [other models](https://ai.pydantic.dev/models/).

# * __Pydantic Logfire Integration__
# Seamlessly [integrates](https://ai.pydantic.dev/logfire/) with [Pydantic Logfire](https://pydantic.dev/logfire) for real-time debugging, performance monitoring, and behavior tracking of your LLM-powered applications.

# * __Type-safe__
# Designed to make [type checking](https://ai.pydantic.dev/agents/#static-type-checking) as powerful and informative as possible for you.

# * __Python-centric Design__
# Leverages Python's familiar control flow and agent composition to build your AI-driven projects, making it easy to apply standard Python best practices you'd use in any other (non-AI) project.

# * __Structured Responses__
# Harnesses the power of [Pydantic](https://docs.pydantic.dev/latest/) to [validate and structure](https://ai.pydantic.dev/results/#structured-result-validation) model outputs, ensuring responses are consistent across runs.

# * __Dependency Injection System__
# Offers an optional [dependency injection](https://ai.pydantic.dev/dependencies/) system to provide data and services to your agent's [system prompts](https://ai.pydantic.dev/agents/#system-prompts), [tools](https://ai.pydantic.dev/tools/) and [result validators](https://ai.pydantic.dev/results/#result-validators-functions).
# This is useful for testing and eval-driven iterative development.

# * __Streamed Responses__
# Provides the ability to [stream](https://ai.pydantic.dev/results/#streamed-results) LLM outputs continuously, with immediate validation, ensuring rapid and accurate results.

# * __Graph Support__
# [Pydantic Graph](https://ai.pydantic.dev/graph) provides a powerful way to define graphs using typing hints, this is useful in complex applications where standard control flow can degrade to spaghetti code.

## Now run 03_vectorizer in the db to create a new vectorizer.

Now run a semantic search in SQL:

In [None]:
-- Semantic search: embed the question with the nomic-embed-text model through
-- ai.ollama_embed (Ollama reached at http://ollama:11434) and return the 5 rows
-- of documentation_embeddings whose embedding has the smallest distance to it.
SELECT
    content,
    embedding <=> ai.ollama_embed('nomic-embed-text', 'what is an agent?', host => 'http://ollama:11434') as distance
FROM documentation_embeddings
ORDER BY distance
LIMIT 5;

### Let's create a function to run the SQL semantic search in Python

In [24]:
# Connection settings for the Postgres instance on localhost
db_params = dict(
    dbname='postgres',
    user='postgres',
    password='postgres',
    host='localhost',
    port='5432',
)

In [28]:
def fetch_search_results(search_text, limit=1):
    """Run a semantic search over ``documentation_embeddings`` and print the hits.

    The search text is embedded in the database with
    ``ai.ollama_embed('nomic-embed-text', ...)`` and rows are ordered by the
    ``<=>`` distance between that embedding and each stored embedding.

    Args:
        search_text: The question or phrase to search for.
        limit: Maximum number of rows to print. Defaults to 1.

    Returns:
        None. Each hit is printed as a markdown section (distance, content,
        separator). Errors raised while querying are printed, not re-raised;
        a failure to connect still propagates to the caller.
    """
    # Both values are passed as bound parameters, never formatted into the SQL
    query = """
    SELECT
        content,
        embedding <=> ai.ollama_embed('nomic-embed-text', %s, host => 'http://ollama:11434') as distance
    FROM documentation_embeddings
    ORDER BY distance
    LIMIT %s;
    """

    # Reconnect to the database on every call
    conn = psycopg2.connect(**db_params)
    try:
        # The cursor is created inside the try so the connection is closed
        # even if cursor creation or the query fails
        with conn.cursor() as cur:
            cur.execute(query, (search_text, limit))
            results = cur.fetchall()

        # Print results in markdown format
        for row in results:
            print(f"## Search Result (Distance: {row[1]:.4f})\n")
            print(f"{row[0]}\n")
            print("---\n")

    except Exception as e:
        print(f"An error occurred: {e}")

    finally:
        # Always close the connection
        conn.close()




In [29]:
# Ask a sample question; fetch_search_results prints the match(es) it finds
query = 'how to install pydantic'

fetch_search_results(query)

## Search Result (Distance: 0.1857)

# Installation

PydanticAI is available on PyPI as [`pydantic-ai`](https://pypi.org/project/pydantic-ai/) so installation is as simple as:

```bash
pip/uv-add pydantic-ai
```

(Requires Python 3.9+)

This installs the `pydantic_ai` package, core dependencies, and libraries required to use all the models
included in PydanticAI. If you want to use a specific model, you can install the ["slim"](#slim-install) version of PydanticAI.

---



# Bonus Section

## Let's take a look at the new pydantic-ai library and make a simple RAG agent.
1. Import the necessary libs
2. Connect to the Ollama model running in the container
3. Create a new agent
4. Give the agent a tool that runs the semantic search

In [21]:
from pydantic_ai import Agent
from pydantic_ai.models.ollama import OllamaModel

import nest_asyncio
nest_asyncio.apply()

In [22]:
# Model handle for the Ollama server.
# NOTE(review): the database cells connect to 'localhost', while this URL uses
# the 'ollama' hostname - confirm that name resolves from where the notebook runs.
# NOTE(review): OllamaModel may expect the OpenAI-compatible endpoint
# (a base_url ending in '/v1') - TODO confirm against the pydantic-ai docs.
ollama_model = OllamaModel(
    model_name='llama',  # NOTE(review): presumably must match a model name pulled in Ollama - verify
    base_url='http://ollama:11434',  
)


In [30]:
# Create the agent. The system prompt only carries standing instructions; the
# user's question is supplied when the agent is run, so it is not embedded here.
agent = Agent(
    ollama_model,
    system_prompt=(
        # Adjacent string literals are concatenated, so the first sentence
        # ends with ". " to keep the two sentences separated.
        "You are an expert on building applications with pydantic-ai library. "
        "Answer questions by providing code and suggestions."
    ),
)

In [None]:
# tool_plain registers a tool that does not take the agent's RunContext as its
# first argument (plain @agent.tool expects that context parameter).
@agent.tool_plain
def search_tool(query: str) -> str:
    """
    Tool to perform a semantic search using the provided query.

    Args:
        query: The text to search the documentation for.

    Returns:
        The markdown-formatted search results as a single string.
    """
    import contextlib
    import io

    # fetch_search_results prints its results; capture that output so the
    # agent receives the text as the tool's return value.
    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        fetch_search_results(query)
    return captured.getvalue()
