2 changes: 2 additions & 0 deletions .env.example
@@ -12,3 +12,5 @@ GEMINI_API_KEY=your-gemini-api-key

# Optional OpenAI key if you want to use GPT-based LMs with DSpy.
# OPENAI_API_KEY=your-openai-api-key
# OPENAI_API_BASE=your-openai-api-base-url
# OPENAI_MODEL=gpt-4o-mini
37 changes: 24 additions & 13 deletions AGENTS.md
@@ -1,17 +1,28 @@
# mcp-code-mode Agent Guide

## Quick Links
- [Documentation & User Guide](file:///Users/themrb/Documents/personal/mcp-code-mode/docs/README.md)
- [Roadmap](file:///Users/themrb/Documents/personal/mcp-code-mode/docs/ROADMAP.md)

## Commands
- **Install**: `python -m venv .venv && source .venv/bin/activate && pip install -e .[dev]`
- **Run Server**: `python -m mcp_code_mode.executor_server`
- **Test**: `pytest` or single file `pytest tests/test_executor.py`
- **Lint**: `ruff check .` (fix with `--fix`), `black .`, `mypy src`
- **Verify**: `python scripts/test_dspy_sandbox.py` (sanity check)
- **Setup**: `python -m venv .venv && source .venv/bin/activate && pip install -e .[dev]`
- **Server**: `python -m mcp_code_mode.executor_server`
- **Test All**: `pytest` or with coverage `pytest --cov=mcp_code_mode`
- **Test Single**: `pytest tests/test_executor.py` or `pytest tests/test_executor.py::test_name`
- **Lint**: `ruff check .` (auto-fix: `ruff check . --fix`), `black .`, `mypy src`
- **Verify**: `python scripts/test_dspy_sandbox.py` (integration sanity check)

## Guidelines
- **Style**: Python 3.11+, Black formatting, Ruff linting, comprehensive type hints (`mypy`).
- **Conventions**: Use absolute imports from `mcp_code_mode`. Prefer `async/await`.
- **Env**: Copy `.env.example` to `.env`.

## Code Style

- **Python**: 3.11-3.12 only (strict >=3.11,<3.13)
- **Formatting**: Black (line length 88), Ruff linting, full type hints for mypy --strict
- **Imports**: Absolute from `mcp_code_mode` (e.g., `from mcp_code_mode.executor import ...`)
- **Async**: Prefer `async/await` over threads; use `asyncio.run()` for entry points
- **Types**: Use `TypedDict`, `dataclass`, and type annotations everywhere; reach for `Any` only as a last resort
- **Errors**: Raise specific exceptions (e.g., `RuntimeError`, `ValueError`), include context in messages
- **Naming**: snake_case functions/vars, PascalCase classes, UPPER_SNAKE constants, descriptive names
- **Tests**: Use `pytest` with `pytest-asyncio` for async code; stub external deps (see the `StubInterpreter` pattern, sketched below)
- **Docstrings**: Module-level for non-trivial files, class/function docstrings for public APIs
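
A minimal sketch of that stub pattern, assuming the code under test accepts an interpreter-like object with an async `execute` method (names and signatures here are illustrative, not the repo's exact API):

```python
import pytest


class StubInterpreter:
    """Test double for the real Deno/Pyodide interpreter."""

    def __init__(self, stdout: str = "") -> None:
        self.stdout = stdout
        self.executed: list[str] = []

    async def execute(self, code: str) -> str:
        self.executed.append(code)  # record what the code under test sent
        return self.stdout


async def run_snippet(interpreter: StubInterpreter, code: str) -> str:
    # Hypothetical wrapper standing in for the real executor entry point.
    return await interpreter.execute(code)


@pytest.mark.asyncio
async def test_run_snippet_captures_stdout() -> None:
    stub = StubInterpreter(stdout="hello from sandbox\n")
    assert await run_snippet(stub, "print('hello')") == "hello from sandbox\n"
    assert stub.executed == ["print('hello')"]
```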

## Context

- **Purpose**: MCP server exposing DSpy sandbox (Deno + Pyodide) for secure code execution
- **Key files**: `executor.py` (sandbox), `agent.py` (DSpy CodeAct), `executor_server.py` (MCP server)
- **Env**: Copy `.env.example` to `.env` for API keys (Gemini, etc.)
1 change: 1 addition & 0 deletions CLAUDE.md
1 change: 1 addition & 0 deletions GEMINI.md
10 changes: 5 additions & 5 deletions mcp_servers.json
@@ -1,10 +1,10 @@
{
"servers": {
"filesystem": {
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/Users/themrb/Documents/personal/mcp-code-mode/docs"],
"description": "Local file system operations inside the repo",
"env": {}
"fetch": {
"command": "uvx",
"args": [
"mcp-server-fetch"
]
},
"memory": {
"command": "npx",
190 changes: 144 additions & 46 deletions src/mcp_code_mode.egg-info/PKG-INFO
@@ -8,6 +8,10 @@ Description-Content-Type: text/markdown
Requires-Dist: fastmcp>=2.0.0
Requires-Dist: dspy-ai>=2.5.0
Requires-Dist: mcp>=1.0.0
Requires-Dist: aiohttp>=3.9.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: python-dotenv>=1.0.0
Requires-Dist: google-generativeai>=0.3.0
Provides-Extra: dev
Requires-Dist: pytest>=7.4.0; extra == "dev"
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -18,78 +22,172 @@ Requires-Dist: mypy>=1.7.0; extra == "dev"

# MCP Code Mode

Prototype implementation for the Code Execution MCP Server with DSpy. This repo follows the implementation plan in `docs/implementation-plan.md`.
Prototype implementation for the Code Execution MCP Server with DSpy. The "Code Execution with MCP" architecture pairs the code-generation strength of Large Language Models with the Model Context Protocol's tool integration. The system lets an AI agent write Python code that runs in an isolated sandbox while seamlessly calling external MCP tools.

## Toolchain Requirements

- Python 3.11 or 3.12 (>=3.11, <3.13)
- Node.js 20+ with `npx` available (needed for the reference MCP servers)
- `pip` for installing the Python dependencies listed in `pyproject.toml` / `requirements*.txt`

## Quick Start

### 1. Installation
Requires Python 3.11+ and Node.js 20+.

```bash
# Create virtual environment
python3.11 -m venv .venv
source .venv/bin/activate
pip install -r requirements-dev.txt
pip install -e .

# Install dependencies
pip install -e .[dev]

# Install Node.js dependencies for reference servers
npm install -g npm@latest
```

To keep the Node-based MCP servers current, run `npm install -g npm@latest`.

### 2. Configuration
Copy the example environment file and configure secrets:
```bash
cp .env.example .env
```
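
For reference, the resulting `.env` keys match the `.env.example` shown in the diff above (the OpenAI entries are optional overrides):

```bash
GEMINI_API_KEY=your-gemini-api-key

# Optional OpenAI key if you want to use GPT-based LMs with DSpy.
# OPENAI_API_KEY=your-openai-api-key
# OPENAI_API_BASE=your-openai-api-base-url
# OPENAI_MODEL=gpt-4o-mini
```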

Configure your MCP servers in `mcp_servers.json`:
```json
{
"servers": {
"filesystem": {
"command": "npx",
"args": ["-y", "@modelcontextprotocol/server-filesystem", "/your-working-folder"],
"description": "Local file system operations"
}
}
}
```

### 3. Running the Server
Launch the Code Execution MCP server:
```bash
python -m mcp_code_mode.executor_server
```

The `mcp_servers.json` file enumerates the default MCP servers (filesystem, memory, fetch). Update this file to point at any additional servers you want available during experimentation.
### 4. Verification
Verify your setup by running the debug executor script. This script simulates an MCP client, connects to the server, and runs a test task to ensure the agent and tools are working correctly.

Before running the script:
1. Configure the MCP servers you want to interact with in `mcp_servers.json`.
2. Define the specific task you want the agent to perform by editing the `task` variable in `scripts/debug_executor.py`.

```bash
python scripts/debug_executor.py
```

## Phase 1 Executor Server

The Phase 1 milestone introduces a minimal FastMCP server that exposes a single `execute_code` tool backed by DSpy's sandboxed Python interpreter.

1. Activate your virtual environment.
2. Launch the server with:
   ```bash
   python -m mcp_code_mode.executor_server
   ```
3. Point an MCP-compatible client at the process (stdio transport) and call the `execute_code` tool with arbitrary Python snippets.
## Development Commands

| Command | Description |
|---------|-------------|
| `pytest` | Run all tests |
| `ruff check .` | Lint the codebase |
| `black .` | Format the codebase |
| `mypy src` | Type check the source |
| `python scripts/test_dspy_sandbox.py` | Sanity check the sandbox |
| `python scripts/debug_executor.py` | Integration test with mock client |

Every `execute_code` invocation returns a structured payload:

| Field | Description |
|-------|-------------|
| `success` | `True` if the snippet finished without exceptions or timeouts. |
| `stdout` / `stderr` | Captured output streams (truncated to 64 kB). |
| `duration_ms` | Total runtime in milliseconds. |
| `diagnostics` | Optional metadata describing errors/timeouts. |
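
For example, a successful `print('hello from sandbox')` run comes back as the following (taken from the Phase 1 test log; timings will vary):

```json
{
  "success": true,
  "stdout": "hello from sandbox\n",
  "stderr": "",
  "duration_ms": 1978,
  "diagnostics": null
}
```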
Timeouts and invalid arguments are reported cleanly, and failures are echoed through the FastMCP context log for easier debugging.

## Execution Environment & Guardrails

By default, the system uses a **Local Python Executor** (`LocalPythonExecutor`), which runs code in the same process as the server. This is necessary because the strict Pyodide sandbox has limitations with network I/O, preventing it from calling back to other MCP tools in some environments.

### Guardrails
Even with the local executor, the system enforces policies before code execution:
- **Limits**: 8k characters / 400 lines max.
- **Imports**: Allowlist only (`json`, `math`, `re`, `datetime`, etc.).
- **Tokens**: Disallows potentially dangerous tokens (`subprocess`, `exec`, `eval`).

Violations return a `POLICY_VIOLATION` error.
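
A minimal sketch of the idea behind these checks (the real enforcement lives in `policies.py`; the limits and tokens are taken from the list above, everything else is illustrative):

```python
import re

MAX_CHARS, MAX_LINES = 8_000, 400
ALLOWED_IMPORTS = {"json", "math", "re", "datetime"}  # abbreviated allowlist
FORBIDDEN_TOKENS = ("subprocess", "exec", "eval")


def check_policy(code: str) -> None:
    """Raise before execution if the snippet violates a policy."""
    if len(code) > MAX_CHARS or code.count("\n") + 1 > MAX_LINES:
        raise ValueError("POLICY_VIOLATION: snippet exceeds size limits")
    for module in re.findall(r"^\s*(?:import|from)\s+(\w+)", code, re.MULTILINE):
        if module not in ALLOWED_IMPORTS:
            raise ValueError(f"POLICY_VIOLATION: import '{module}' not in allowlist")
    for token in FORBIDDEN_TOKENS:
        if token in code:  # naive substring check, for illustration only
            raise ValueError(f"POLICY_VIOLATION: forbidden token '{token}'")
```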

> **Note**: You can force the use of the Pyodide sandbox by setting `MCP_EXECUTOR=pyodide`, but this may break tool calls depending on your environment.

## Testing Status

The Phase 1 executor server has been tested with the following scenarios:

### ✅ Completed Tests

1. **Basic Execution**: Successfully executes simple Python snippets with correct stdout capture
   - Test: `print('hello from sandbox')`
   - Result: `{"success":true,"stdout":"hello from sandbox\n","stderr":"","duration_ms":1978,"diagnostics":null}`

2. **Error Handling**: Properly captures and reports Python exceptions with diagnostic information
   - Test: `raise ValueError("boom")`
   - Result: `{"success":false,"stdout":"","stderr":"ValueError: ['boom']","duration_ms":20,"diagnostics":{"error_type":"InterpreterError","traceback":"..."}}`

3. **Timeout Detection**: Correctly detects and reports execution timeouts
   - Test: `while True: pass` (2s timeout)
   - Result: `{"success":false,"stdout":"","stderr":"Execution timed out after 2.00s","duration_ms":2001,"diagnostics":{"error_type":"TIMEOUT","timeout_seconds":2.0}}`

## Architecture

### Overview

```
┌─────────────────────────────────────────────────────────────┐
│ MCP Client (Claude, etc.) │
└────────────────────────┬────────────────────────────────────┘
│ MCP Protocol (stdio/HTTP/SSE)
┌─────────────────────────────────────────────────────────────┐
│ FastMCP Server │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ @mcp.tool │ │
│ │ async def execute_code(code: str): │ │
│ │ # 1. Execute in Local Executor (default) │ │
│ │ result = await executor.run(code) │ │
│ │ return result │ │
│ └──────────────────────────────────────────────────────┘ │
└────────────────────────┬────────────────────────────────────┘
┌──────────────────────────────┐
│ Execution Engine: │
│ • LocalPythonExecutor │
│ (or Pyodide Sandbox) │
└──────────────────────────────┘
```

### Why Code Mode?

Traditional MCP implementations face critical challenges:
1. **Context Window Bloat**: Every tool definition consumes tokens, limiting scalability.
2. **Token Cost**: Multiple back-and-forth tool calls are expensive.
3. **Latency**: Sequential tool invocations create cumulative delays.
4. **Composability**: Complex workflows require many discrete steps.

Code Mode addresses these by leveraging what LLMs excel at: writing code. Rather than making multiple tool calls, the agent writes a Python script that orchestrates all necessary operations internally.
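
As an illustration, instead of several separate tool-call round trips, the agent might emit a single script along these lines. Here `call_tool` is a stand-in for whatever bridge the executor injects; the stub body only exists to make the sketch self-contained and runnable:

```python
import asyncio
from typing import Any


async def call_tool(name: str, **kwargs: Any) -> dict[str, Any]:
    """Stand-in for the executor-injected MCP bridge; real calls go over MCP."""
    print(f"[tool call] {name} {kwargs}")
    return {"text": "stub response " * 40}


async def main() -> None:
    # One sandbox run orchestrates fetch -> transform -> store internally,
    # instead of three model round trips.
    page = await call_tool("fetch", url="https://modelcontextprotocol.io")
    summary = page["text"][:200]
    await call_tool("memory_store", key="mcp_overview", value=summary)
    print(f"Stored {len(summary)} characters")


asyncio.run(main())
```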

### Core Components

1. **The Executor Server (FastMCP)** (`src/mcp_code_mode/executor_server.py`)
The server exposes an `execute_code` tool backed by a Python executor (Local or Pyodide). Uses `fastmcp` to handle the MCP protocol and `dspy` for execution logic.

2. **Configuration-Driven Discovery** (`mcp_servers.json`)
The system uses `mcp_servers.json` to explicitly configure which MCP servers to connect to. Loaded by `src/mcp_code_mode/mcp_manager.py`.

3. **Tool Schema Formatting** (`src/mcp_code_mode/tool_formatter.py`)
Formats discovered MCP tools into readable documentation that gets passed to the code generation LLM, so it knows what tools exist.

4. **Context Injection**
The formatted tool schemas are passed as an input field to the LLM. The LLM knows tool names, parameters, and usage examples *before* it writes the code.
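
Conceptually, that injection amounts to an extra input field in the code-generation signature. A sketch in DSpy terms (not the repo's actual signature):

```python
import dspy


class GenerateCode(dspy.Signature):
    """Write Python code that completes the task using the documented MCP tools."""

    task: str = dspy.InputField(desc="Natural-language task description")
    tool_context: str = dspy.InputField(desc="Formatted MCP tool schemas")
    code: str = dspy.OutputField(desc="Python snippet for the sandbox to execute")
```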

### Information Flow

```
1. mcp_servers.json (Defines servers)
2. MCPServerManager.initialize()
├─ Connect to configured servers
├─ Call list_tools() on each
└─ Convert to DSpy tools
3. ToolSchemaFormatter.format_for_llm()
└─ Creates readable documentation
4. CodeExecutionAgent
└─ Stores both callable tools and schemas
5. Agent Generation
└─ Passes tool_context to LLM
6. Code Execution
└─ Code runs in sandbox, calling actual tools via MCP
```
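
A rough sketch of how these steps might be wired together at startup; the class and method names follow the flow above, but the exact signatures and the `dspy_tools` attribute are assumptions:

```python
import asyncio

from mcp_code_mode.agent import CodeExecutionAgent
from mcp_code_mode.mcp_manager import MCPServerManager
from mcp_code_mode.tool_formatter import ToolSchemaFormatter


async def bootstrap() -> CodeExecutionAgent:
    manager = MCPServerManager("mcp_servers.json")  # 1. load server config
    await manager.initialize()                      # 2. connect + list_tools()
    tools = manager.dspy_tools                      # assumed attribute: DSpy-wrapped tools
    tool_context = ToolSchemaFormatter(tools).format_for_llm()  # 3. readable docs
    return CodeExecutionAgent(tools=tools, tool_context=tool_context)  # 4-5.


agent = asyncio.run(bootstrap())
```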
### Troubleshooting

**Timeout Issues**:
If the interpreter times out, it may enter a bad state. Currently, the best fix is to restart the server or reconnect the client to get a fresh interpreter instance.

**Missing Tools**:
Ensure `mcp_servers.json` paths are correct and that you have run `npm install` if using Node-based servers.

### ⚠️ Known Issues

1. **Interpreter State Management**: After a timeout occurs, the interpreter instance enters a bad state where all subsequent executions immediately time out. Disconnecting and reconnecting to the MCP server is required to obtain a fresh interpreter instance.

### 🔄 Next Steps

1. Fix interpreter state management after timeouts
2. Implement proper interpreter recycling/recreation
3. Add tool formatter + integration utilities for Phase 2
4. Enable generated code to discover/use remote MCP tools
## References
- [DSpy Documentation](https://dspy.ai)
- [Model Context Protocol](https://modelcontextprotocol.io)
- [FastMCP](https://github.com/jlowin/fastmcp)
# mcp-code-mode
7 changes: 6 additions & 1 deletion src/mcp_code_mode.egg-info/SOURCES.txt
@@ -7,15 +7,20 @@ src/mcp_code_mode/executor.py
src/mcp_code_mode/executor_server.py
src/mcp_code_mode/mcp_integration.py
src/mcp_code_mode/mcp_manager.py
src/mcp_code_mode/policies.py
src/mcp_code_mode/sandbox_config.py
src/mcp_code_mode/tool_bridge.py
src/mcp_code_mode/tool_formatter.py
src/mcp_code_mode/validate_agent.py
src/mcp_code_mode.egg-info/PKG-INFO
src/mcp_code_mode.egg-info/SOURCES.txt
src/mcp_code_mode.egg-info/dependency_links.txt
src/mcp_code_mode.egg-info/requires.txt
src/mcp_code_mode.egg-info/top_level.txt
tests/test_agent_injection.py
tests/test_agent.py
tests/test_executor.py
tests/test_mcp_integration.py
tests/test_mcp_manager_config.py
tests/test_policies.py
tests/test_tool_bridge_runtime.py
tests/test_tool_formatter.py
4 changes: 4 additions & 0 deletions src/mcp_code_mode.egg-info/requires.txt
@@ -1,6 +1,10 @@
fastmcp>=2.0.0
dspy-ai>=2.5.0
mcp>=1.0.0
aiohttp>=3.9.0
httpx>=0.24.0
python-dotenv>=1.0.0
google-generativeai>=0.3.0

[dev]
pytest>=7.4.0
17 changes: 11 additions & 6 deletions src/mcp_code_mode/example_usage.py
@@ -42,8 +42,17 @@ async def main():
# 1. Configure DSpy
gemini_key = os.environ.get("GEMINI_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")

if gemini_key:
openai_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
openai_model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

# Prefer OpenAI for speed/reliability if available, then Gemini
if openai_key:
# Ensure we use the openai provider prefix for LiteLLM
model = openai_model if openai_model.startswith("openai/") else f"openai/{openai_model}"
lm = dspy.LM(model, api_key=openai_key, api_base=openai_base)
dspy.configure(lm=lm)
print(f"✅ DSpy configured with OpenAI ({openai_model}){' at ' + openai_base if openai_base else ''}")
elif gemini_key:
# Use dspy.LM with gemini/ prefix which uses litellm under the hood
try:
# Note: dspy.Google is deprecated/removed in newer versions, use dspy.LM
@@ -53,10 +62,6 @@
except Exception as e:
print(f"❌ Failed to configure Gemini: {e}")
return
elif openai_key:
lm = dspy.LM("openai/gpt-4o-mini", api_key=openai_key)
dspy.configure(lm=lm)
print("✅ DSpy configured with OpenAI (gpt-4o-mini)")
else:
print("❌ No API key found. Please set GEMINI_API_KEY or OPENAI_API_KEY.")
return
7 changes: 5 additions & 2 deletions src/mcp_code_mode/executor_server.py
@@ -45,12 +45,15 @@ async def server_lifespan(server: FastMCP) -> AsyncIterator[None]:
# We default to gpt-4o-mini if available, or let dspy auto-configure if env vars are set
gemini_key = os.environ.get("GEMINI_API_KEY")
openai_key = os.environ.get("OPENAI_API_KEY")
openai_base = os.environ.get("OPENAI_API_BASE", "https://api.openai.com/v1")
openai_model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

# Prefer OpenAI for speed/reliability if available, then Gemini
if openai_key:
lm = dspy.LM("openai/gpt-4o-mini", api_key=openai_key)
# Ensure we use the openai provider prefix for LiteLLM
model = openai_model if openai_model.startswith("openai/") else f"openai/{openai_model}"
lm = dspy.LM(model, api_key=openai_key, api_base=openai_base)
dspy.configure(lm=lm)
print("✅ DSpy configured with OpenAI (gpt-4o-mini)", file=sys.stderr)
print(f"✅ DSpy configured with OpenAI ({openai_model}) at {openai_base}", file=sys.stderr)
elif gemini_key:
# Use dspy.LM with gemini/ prefix which uses litellm under the hood
try:
2 changes: 2 additions & 0 deletions src/mcp_code_mode/sandbox_config.py
@@ -11,6 +11,8 @@
# Only a minimal set of environment variables may leak into the sandbox.
_ENV_ALLOWLIST = (
"OPENAI_API_KEY",
"OPENAI_API_BASE",
"OPENAI_MODEL",
"GEMINI_API_KEY",
"GOOGLE_API_KEY",
)