In [1]:
# Enable IPython's autoreload extension so that edits to the imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Setup

In [2]:
from config.utils import setup_logging, get_logger
from config.config import ConfigManager

# Configure logging at INFO level before the first message is logged.
setup_logging(level="INFO")
logger = get_logger(__name__)
logger.info("Starting the application...")
# Create the configuration manager. Per the log output below, this loads and
# validates config.json. Later cells pass `config.config` to the pipeline steps.
config = ConfigManager()

2025-05-14 22:52:44,944 - __main__ - INFO - Starting the application...
2025-05-14 22:52:44,947 - config.config - INFO - Attempting to load configuration from config.json
2025-05-14 22:52:44,950 - config.config - INFO - Configuration loaded successfully.
2025-05-14 22:52:44,955 - config.config - INFO - Configuration validated.


# Data collection

**Fetches the data from the sources listed in `config.json`.**

In [3]:
from data_collection.collect_data import run_initial_fetch

# Run the initial documentation fetch. The function is called without
# arguments; per the log output it loads config.json on its own instead of
# using the `config` object created in the setup cell.
run_initial_fetch()

2025-05-14 22:53:08,818 - data_collection.collect_data - INFO - Loading configuration...
2025-05-14 22:53:08,820 - config.config - INFO - Attempting to load configuration from config.json
2025-05-14 22:53:08,822 - config.config - INFO - Configuration loaded successfully.
2025-05-14 22:53:08,823 - config.config - INFO - Configuration validated.
2025-05-14 22:53:08,823 - data_collection.collect_data - INFO - Configuration loaded. Log file set to: documentation.log
2025-05-14 22:53:08,824 - data_collection.collect_data - INFO - Initialization complete. Starting data fetch.
2025-05-14 22:53:08,825 - data_collection.fetcher - INFO - Documentation storage path: /Users/eliza/search-pydantic-ai/retrieved_data/documentation_raw
2025-05-14 22:53:08,826 - data_collection.collect_data - INFO - Fetching 1 documentation sources
2025-05-14 22:53:08,826 - data_collection.fetcher - INFO - Starting documentation fetch process for 1 URLs...
2025-05-14 22:53:08,827 - data_collection.fetcher - INFO - Attem

# Chunking and vectorization

**Updates the vector DB incrementally.**

Either OpenAI or Hugging Face (HF) embeddings can be selected in `config.json`.
The default is OpenAI.

In [4]:
from vectorizing_and_retrieval.create_vectors import (
    update_vector_db_for_all_urls,
)
# Update the vector DB for the documentation URLs, passing the `config`
# attribute of the ConfigManager created in the setup cell.
# NOTE(review): `config.config` is presumably the loaded settings object — confirm.
update_vector_db_for_all_urls(config.config)

2025-05-14 22:54:39,422 - vectorizing_and_retrieval.create_vectors - INFO - Processing 1 unique documentation URLs from config.
2025-05-14 22:54:39,423 - vectorizing_and_retrieval.create_vectors - INFO - Starting vector DB update for URL: https://ai.pydantic.dev/
2025-05-14 22:54:39,424 - vectorizing_and_retrieval.create_vectors - INFO - Vector store index path: /Users/eliza/search-pydantic-ai/retrieved_data/vector_store/ai_pydantic_dev_index
2025-05-14 22:54:40,746 - vectorizing_and_retrieval.create_vectors - INFO - Initializing OpenAI Embeddings (Model: text-embedding-3-small)
2025-05-14 22:54:40,817 - vectorizing_and_retrieval.create_vectors - INFO - Attempting to load existing vector store from: /Users/eliza/search-pydantic-ai/retrieved_data/vector_store/ai_pydantic_dev_index
2025-05-14 22:54:40,821 - faiss.loader - INFO - Loading faiss with AVX512 support.
2025-05-14 22:54:40,822 - faiss.loader - INFO - Could not load library with AVX512 support due to:
ModuleNotFoundError("No mod

# Create a simple graph overlay

For now, the graph only connects nearby chunks. In the future, I hope to also connect documents based on the symbols they use, and potentially link them to the source code.

In [5]:
from vectorizing_and_retrieval.create_graphs import create_graph_for_all_urls

# Build the graph overlay for the documentation URLs, passing the same
# `config.config` value that the vectorization cell uses. Per the log output,
# the resulting graph is serialized to a JSON file.
create_graph_for_all_urls(config.config)

2025-05-14 22:54:41,337 - vectorizing_and_retrieval.create_graphs - INFO - Processing 1 unique documentation URLs for graph creation.
2025-05-14 22:54:41,338 - vectorizing_and_retrieval.create_graphs - INFO - Processing files in source directory: /Users/eliza/search-pydantic-ai/retrieved_data/documentation_raw/ai.pydantic.dev_index
2025-05-14 22:54:41,339 - vectorizing_and_retrieval.create_vectors - INFO - Initializing OpenAI Embeddings (Model: text-embedding-3-small)
2025-05-14 22:54:41,363 - vectorizing_and_retrieval.create_vectors - INFO - Attempting to load existing vector store from: /Users/eliza/search-pydantic-ai/retrieved_data/vector_store/ai_pydantic_dev_index
Total edges created: 37986
Total edges created: 37986
2025-05-14 22:54:41,595 - vectorizing_and_retrieval.create_graphs - INFO - Serializing graph with 276 nodes and 37986 edges to /Users/eliza/search-pydantic-ai/retrieved_data/graphs/ai_pydantic_dev_index.json


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


# Done