gopaytech · clavinjune · Dec 2, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
diff --git a/knowledge server/.gitignore b/knowledge server/.gitignore
@@ -0,0 +1 @@
+config.yaml
diff --git a/knowledge server/.python-version b/knowledge server/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/knowledge server/Makefile b/knowledge server/Makefile
@@ -0,0 +1,15 @@
+.PHONY: check format lint type-check run
+
+check: lint format type-check
+
+run:
+	uv run main.py
+
+lint:
+	uvx ruff check --fix
+
+format:
+	uvx ruff format
+
+type-check:
+	uvx ty check
diff --git a/knowledge server/README.md b/knowledge server/README.md
@@ -0,0 +1,168 @@
+# Knowledge Server
+
+A Model Context Protocol (MCP) server that provides AI agents with access to a knowledge base built from PDF and Markdown documents using RAG (Retrieval-Augmented Generation).
+
+## Features
+
+- **Document Loading**: Supports PDF and Markdown files from local directories
+- **Vector Storage**: Uses Milvus for efficient vector similarity search with full-text search support
+- **Embeddings**: Configurable embeddings via Ollama
+- **Text Chunking**: Recursive character text splitting with configurable chunk size and overlap
+- **MCP Integration**: Exposes knowledge base queries through FastMCP server
+- **Flexible Configuration**: YAML-based configuration for easy customization
+
+## Architecture
+
+```
+┌─────────────┐      ┌──────────────┐      ┌────────────┐
+│  Datasource │─────▶│    Loader    │─────▶│  Splitter  │
+│   (YAML)    │      │ (PDF/MD)     │      │            │
+└─────────────┘      └──────────────┘      └────────────┘
+                                                   │
+                                                   ▼
+┌─────────────┐      ┌──────────────┐      ┌────────────┐
+│  MCP Client │◀─────│  MCP Server  │◀─────│   Milvus   │
+│  (AI Agent) │      │  (FastMCP)   │      │  (Vector)  │
+└─────────────┘      └──────────────┘      └────────────┘
+```
+
+## Installation
+
+1. Ensure Python 3.12+ is installed
+2. Install dependencies using uv:
+   ```bash
+   uv sync
+   ```
+
+## Configuration
+
+### 1. Create Configuration File
+
+Copy the example configuration:
+```bash
+cp config.example.yaml config.yaml
+```
+
+Edit `config.yaml`:
+```yaml
+log_level: DEBUG
+vector_store:
+  type: milvus
+  url: "http://localhost:19530"
+  collection_name: knowledge_base
+  reset_collection: true
+  enable_full_text_search: true
+chunk_size: 1000
+chunk_overlap: 200
+embeddings:
+  source: ollama
+  model: nomic-embed-text
+```
+
+### 2. Configure Data Sources
+
+Create `datasource.yaml`:
+```yaml
+datasource:
+  - type: directory
+    path: ../datasets/
+```
+
+### 3. Start Milvus
+
+Using Docker Compose:
+```bash
+docker-compose up -d
+```
+
+This will start Milvus on `http://localhost:19530`.
+
+## Usage
+
+### Running the Server
+
+```bash
+uv run python main.py
+```
+
+The server will:
+1. Load documents from configured datasources
+2. Split documents into chunks
+3. Generate embeddings using Ollama
+4. Store vectors in Milvus
+5. Start the MCP server on streamable-http transport
+
+### Querying the Knowledge Base
+
+The server exposes an MCP tool `query_knowledge_base`:
+
+```python
+query_knowledge_base(
+    query: str,      # Search query
+    top_k: int = 4   # Number of results to return
+) -> list[str]
+```
+
+## Project Structure
+
+```
+.
+├── config/
+│   └── config.py           # Configuration loader
+├── loader/
+│   ├── datasource.py       # Datasource abstraction
+│   └── directory.py        # Directory loader (PDF/MD)
+├── model/
+│   ├── factory.py          # Embeddings factory
+│   └── model_garden.py     # Model configurations
+├── vector_store/
+│   └── milvus.py          # Milvus vector store implementation
+├── main.py                 # Application entry point
+├── config.yaml             # Runtime configuration
+├── datasource.yaml         # Data source definitions
+└── pyproject.toml         # Project dependencies
+```
+
+## Dependencies
+
+- **langchain-community**: Document loaders and utilities
+- **langchain-ollama**: Ollama embeddings integration
+- **mcp**: Model Context Protocol server
+- **pymilvus**: Milvus vector database client
+- **pypdf**: PDF parsing
+- **pyyaml**: YAML configuration parsing
+
+## Development
+
+### Code Style
+
+Format code using Ruff:
+```bash
+uv run ruff format .
+uv run ruff check .
+```
+
+### Type Checking
+
+Type checking uses `ty`; the rules it ignores are listed in `pyproject.toml`. Run it with `make type-check`.
+
+## Troubleshooting
+
+### Import Errors
+
+If you encounter `ImportError: cannot import name 'Blob'`, ensure you're using the correct import:
+```python
+from langchain_community.document_loaders.blob_loaders import Blob
+```
+
+### Milvus Connection Issues
+
+Verify Milvus is running:
+```bash
+docker-compose ps
+```
+
+Check Milvus logs:
+```bash
+docker-compose logs standalone
+```
diff --git a/knowledge server/config.example.yaml b/knowledge server/config.example.yaml
@@ -0,0 +1,9 @@
+log_level: DEBUG
+vector_store:
+  type: milvus
+  url: "http://localhost:19530"
+  collection_name: knowledge_base
+  reset_collection: true
+  enable_full_text_search: true
+chunk_size: 1000
+chunk_overlap: 200
+embeddings:
+  source: ollama
+  model: nomic-embed-text
diff --git a/knowledge server/config/config.py b/knowledge server/config/config.py
@@ -0,0 +1,61 @@
+import yaml
+
+
+def load_config(filepath):
+    with open(filepath, "r") as file:
+        config = yaml.safe_load(file)
+    return config
+
+
+class Config:
+    vector_store: "VectorStoreConfig"
+    log_level: str
+    chunk_size: int
+    chunk_overlap: int
+    embeddings: "EmbeddingsConfig"
+
+    def __init__(self, filepath):
+        config = load_config(filepath)
+        if config is None:
+            raise ValueError("Failed to load configuration.")
+        self.vector_store = VectorStoreConfig(config)
+        self.log_level = config.get("log_level", "INFO")
+        self.chunk_size = config.get("chunk_size", 1000)
+        self.chunk_overlap = config.get("chunk_overlap", 200)
+        self.embeddings = EmbeddingsConfig(config)
+
+
+class EmbeddingsConfig:
+    source: str
+    model: str
+
+    def __init__(self, config: dict):
+        embeddings_config = config.get("embeddings", None)
+        if embeddings_config is None:
+            raise ValueError("Embeddings configuration is missing in the config file.")
+
+        self.source = embeddings_config.get("source", None)
+        self.model = embeddings_config.get("model", None)
+
+
+class VectorStoreConfig:
+    type: str
+    url: str
+    collection_name: str
+    reset_collection: bool
+    enable_full_text_search: bool
+
+    def __init__(self, config: dict):
+        vector_store_config = config.get("vector_store", None)
+        if vector_store_config is None:
+            raise ValueError(
+                "Vector store configuration is missing in the config file."
+            )
+
+        self.type = vector_store_config.get("type", None)
+        self.url = vector_store_config.get("url", None)
+        self.collection_name = vector_store_config.get("collection_name", None)
+        self.reset_collection = vector_store_config.get("reset_collection", False)
+        self.enable_full_text_search = vector_store_config.get(
+            "enable_full_text_search", False
+        )
diff --git a/knowledge server/datasource.yaml b/knowledge server/datasource.yaml
@@ -0,0 +1,3 @@
+datasource:
+  - type: directory
+    path: ../datasets/
diff --git a/knowledge server/docker-compose.yml b/knowledge server/docker-compose.yml
@@ -0,0 +1,67 @@
+#https://github.com/milvus-io/milvus/releases/download/v2.6.7/milvus-standalone-docker-compose.yml
+version: '3.5'
+
+services:
+  etcd:
+    container_name: milvus-etcd
+    image: quay.io/coreos/etcd:v3.5.18
+    environment:
+      - ETCD_AUTO_COMPACTION_MODE=revision
+      - ETCD_AUTO_COMPACTION_RETENTION=1000
+      - ETCD_QUOTA_BACKEND_BYTES=4294967296
+      - ETCD_SNAPSHOT_COUNT=50000
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
+    command: etcd -advertise-client-urls=http://etcd:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
+    healthcheck:
+      test: ["CMD", "etcdctl", "endpoint", "health"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+
+  minio:
+    container_name: milvus-minio
+    image: minio/minio:RELEASE.2024-12-18T13-15-44Z
+    environment:
+      MINIO_ACCESS_KEY: minioadmin
+      MINIO_SECRET_KEY: minioadmin
+    ports:
+      - "9001:9001"
+      - "9000:9000"
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
+    command: minio server /minio_data --console-address ":9001"
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+
+  standalone:
+    container_name: milvus-standalone
+    image: milvusdb/milvus:v2.6.6
+    command: ["milvus", "run", "standalone"]
+    security_opt:
+    - seccomp:unconfined
+    environment:
+      ETCD_ENDPOINTS: etcd:2379
+      MINIO_ADDRESS: minio:9000
+      MQ_TYPE: woodpecker
+    volumes:
+      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
+      interval: 30s
+      start_period: 90s
+      timeout: 20s
+      retries: 3
+    ports:
+      - "19530:19530"
+      - "9091:9091"
+    depends_on:
+      - "etcd"
+      - "minio"
+
+networks:
+  default:
+    name: milvus
diff --git a/knowledge server/loader/datasource.py b/knowledge server/loader/datasource.py
@@ -0,0 +1,31 @@
+import logging
+
+from langchain_core.document_loaders.base import BaseLoader
+from loader.directory import DirectoryLoader
+
+
+class Datasource:
+    type: str
+    path: str
+    url: str
+
+    def __init__(self, type: str, path: str = "", url: str = ""):
+        self.type = type
+        self.path = path
+        self.url = url
+
+
+class DatasourceLoader(BaseLoader):
+    loader: BaseLoader
+
+    def __init__(self, datasource: Datasource, logger: logging.Logger):
+        if datasource.type == "directory":
+            self.loader = DirectoryLoader(datasource.path, logger)
+        else:
+            raise ValueError(f"Unsupported source type: {datasource.type}")
+
+    def lazy_load(self):
+        return self.loader.lazy_load()
+
+    def load(self):
+        return self.loader.load()