Skip to content

Commit e142784

Browse files
committed
feat(all): implement idle process management for MCP servers
1 parent 7e1789b commit e142784

18 files changed

+918
-42
lines changed

services/backend/src/events/satellite/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ const handlerModules = [
2222
() => import('./mcp-tool-executed'),
2323
() => import('./mcp-server-crashed'),
2424
() => import('./mcp-client-activity'),
25+
() => import('./mcp-server-dormant'),
26+
() => import('./mcp-server-respawned'),
27+
() => import('./mcp-server-restarted'),
2528
// Add new handlers here - they will be automatically registered
2629
];
2730

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/**
2+
* MCP Server Dormant Event Handler
3+
*
4+
* Updates satelliteProcesses table when an MCP server process goes dormant due to inactivity
5+
*/
6+
7+
import type { LibSQLDatabase } from 'drizzle-orm/libsql';
8+
import { satelliteProcesses } from '../../db/schema.sqlite';
9+
import { eq } from 'drizzle-orm';
10+
11+
/** Event type identifier for this handler module. */
export const EVENT_TYPE = 'mcp.server.dormant';

/**
 * JSON Schema for Fastify validation of the `mcp.server.dormant` payload.
 *
 * `process_id` is declared as `integer` rather than `number`: operating system
 * process IDs are whole numbers, and `number` would also accept fractional values.
 * `idle_duration_seconds` stays `number` because a duration may be fractional.
 * Unknown extra properties are allowed (`additionalProperties: true`).
 */
export const SCHEMA = {
  type: 'object',
  properties: {
    server_id: {
      type: 'string',
      minLength: 1,
      description: 'MCP server identifier (installation_id)'
    },
    server_slug: {
      type: 'string',
      minLength: 1,
      description: 'MCP server slug (installation_name)'
    },
    team_id: {
      type: 'string',
      minLength: 1,
      description: 'Team identifier'
    },
    process_id: {
      type: 'integer',
      description: 'Operating system process ID'
    },
    idle_duration_seconds: {
      type: 'number',
      minimum: 0,
      description: 'Duration of inactivity before going dormant'
    },
    last_activity_at: {
      type: 'string',
      format: 'date-time',
      description: 'Last activity timestamp'
    }
  },
  required: ['server_id', 'server_slug', 'team_id', 'process_id', 'idle_duration_seconds', 'last_activity_at'],
  additionalProperties: true
} as const;
51+
52+
/** Typed view of the `mcp.server.dormant` event payload. */
type ServerDormantData = {
  /** MCP server identifier (installation_id) */
  server_id: string;
  /** MCP server slug (installation_name) */
  server_slug: string;
  /** Team identifier */
  team_id: string;
  /** Operating system process ID */
  process_id: number;
  /** Duration of inactivity before going dormant */
  idle_duration_seconds: number;
  /** Last activity timestamp */
  last_activity_at: string;
};
61+
62+
/**
 * Handler for the `mcp.server.dormant` event.
 *
 * This event is emitted when a satellite terminates an idle stdio MCP server to save resources.
 * The matching satelliteProcesses row is updated: `status` becomes 'stopped',
 * `health_status` becomes 'unknown', `stopped_at` takes the event timestamp and
 * `updated_at` takes the current time.
 *
 * NOTE(review): the row is matched by `satelliteProcesses.id` against `server_id`,
 * which the schema describes as an installation_id — confirm these are the same key.
 * NOTE(review): `satelliteId` is not used, so the update is not scoped to the
 * reporting satellite — confirm this is intended.
 *
 * @param satelliteId - Identifier of the reporting satellite (unused here)
 * @param eventData - Event payload, read as ServerDormantData without further checks in this function
 * @param db - Drizzle LibSQL database handle used for the update
 * @param eventTimestamp - Event time, stored as `stopped_at`
 */
export async function handle(
  satelliteId: string,
  eventData: Record<string, unknown>,
  db: LibSQLDatabase,
  eventTimestamp: Date
): Promise<void> {
  const { server_id: serverId } = eventData as unknown as ServerDormantData;
  const targetProcess = eq(satelliteProcesses.id, serverId);
  const now = new Date();

  // A dormant process is recorded as stopped
  await db
    .update(satelliteProcesses)
    .set({
      status: 'stopped',
      health_status: 'unknown',
      stopped_at: eventTimestamp,
      updated_at: now
    })
    .where(targetProcess);
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/**
2+
* MCP Server Respawned Event Handler
3+
*
4+
* Updates satelliteProcesses table when a dormant MCP server is respawned
5+
*/
6+
7+
import type { LibSQLDatabase } from 'drizzle-orm/libsql';
8+
import { satelliteProcesses } from '../../db/schema.sqlite';
9+
import { eq } from 'drizzle-orm';
10+
11+
/** Event type identifier for this handler module. */
export const EVENT_TYPE = 'mcp.server.respawned';

/**
 * JSON Schema for Fastify validation of the `mcp.server.respawned` payload.
 *
 * `process_id` is declared as `integer` rather than `number`: operating system
 * process IDs are whole numbers, and `number` would also accept fractional values.
 * The two duration fields stay `number` because measured durations may be fractional.
 * Unknown extra properties are allowed (`additionalProperties: true`).
 */
export const SCHEMA = {
  type: 'object',
  properties: {
    server_id: {
      type: 'string',
      minLength: 1,
      description: 'MCP server identifier (installation_id)'
    },
    server_slug: {
      type: 'string',
      minLength: 1,
      description: 'MCP server slug (installation_name)'
    },
    team_id: {
      type: 'string',
      minLength: 1,
      description: 'Team identifier'
    },
    process_id: {
      type: 'integer',
      description: 'New operating system process ID'
    },
    dormant_duration_seconds: {
      type: 'number',
      minimum: 0,
      description: 'Duration process was dormant'
    },
    respawn_duration_ms: {
      type: 'number',
      minimum: 0,
      description: 'Time taken to respawn process in milliseconds'
    }
  },
  required: ['server_id', 'server_slug', 'team_id', 'process_id', 'dormant_duration_seconds', 'respawn_duration_ms'],
  additionalProperties: true
} as const;
51+
52+
/** Typed view of the `mcp.server.respawned` event payload. */
type ServerRespawnedData = {
  /** MCP server identifier (installation_id) */
  server_id: string;
  /** MCP server slug (installation_name) */
  server_slug: string;
  /** Team identifier */
  team_id: string;
  /** New operating system process ID */
  process_id: number;
  /** Duration process was dormant */
  dormant_duration_seconds: number;
  /** Time taken to respawn process in milliseconds */
  respawn_duration_ms: number;
};
61+
62+
/**
 * Handler for the `mcp.server.respawned` event.
 *
 * This event is emitted when a satellite automatically respawns a dormant stdio MCP server.
 * The matching satelliteProcesses row is updated: `status` becomes 'running',
 * `process_pid` takes the new PID from the payload, `health_status` becomes 'healthy',
 * `started_at` takes the event timestamp and `updated_at` takes the current time.
 *
 * NOTE(review): the row is matched by `satelliteProcesses.id` against `server_id`,
 * which the schema describes as an installation_id — confirm these are the same key.
 * NOTE(review): `satelliteId` is not used, so the update is not scoped to the
 * reporting satellite — confirm this is intended.
 *
 * @param satelliteId - Identifier of the reporting satellite (unused here)
 * @param eventData - Event payload, read as ServerRespawnedData without further checks in this function
 * @param db - Drizzle LibSQL database handle used for the update
 * @param eventTimestamp - Event time, stored as `started_at`
 */
export async function handle(
  satelliteId: string,
  eventData: Record<string, unknown>,
  db: LibSQLDatabase,
  eventTimestamp: Date
): Promise<void> {
  const { server_id: serverId, process_id: newPid } = eventData as unknown as ServerRespawnedData;
  const targetProcess = eq(satelliteProcesses.id, serverId);
  const now = new Date();

  // The respawned process is running again under a new PID
  await db
    .update(satelliteProcesses)
    .set({
      status: 'running',
      process_pid: newPid,
      health_status: 'healthy',
      started_at: eventTimestamp,
      updated_at: now
    })
    .where(targetProcess);
}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/**
2+
* MCP Server Restarted Event Handler
3+
*
4+
* Updates satelliteProcesses table when an MCP server is restarted after a crash
5+
*/
6+
7+
import type { LibSQLDatabase } from 'drizzle-orm/libsql';
8+
import { satelliteProcesses } from '../../db/schema.sqlite';
9+
import { eq } from 'drizzle-orm';
10+
11+
/** Event type identifier for this handler module. */
export const EVENT_TYPE = 'mcp.server.restarted';

/**
 * JSON Schema for Fastify validation of the `mcp.server.restarted` payload.
 *
 * `old_process_id`, `new_process_id` and `attempt_number` are declared as `integer`
 * rather than `number`: process IDs and attempt counters are whole numbers, and
 * `number` would also accept fractional values such as an attempt of 1.5.
 * Unknown extra properties are allowed (`additionalProperties: true`).
 */
export const SCHEMA = {
  type: 'object',
  properties: {
    server_id: {
      type: 'string',
      minLength: 1,
      description: 'MCP server identifier (installation_id)'
    },
    server_slug: {
      type: 'string',
      minLength: 1,
      description: 'MCP server slug (installation_name)'
    },
    team_id: {
      type: 'string',
      minLength: 1,
      description: 'Team identifier'
    },
    old_process_id: {
      type: 'integer',
      description: 'Previous operating system process ID'
    },
    new_process_id: {
      type: 'integer',
      description: 'New operating system process ID after restart'
    },
    restart_reason: {
      type: 'string',
      enum: ['crash', 'health_check_failed'],
      description: 'Reason for the restart'
    },
    attempt_number: {
      type: 'integer',
      minimum: 1,
      maximum: 3,
      description: 'Restart attempt number (1-3)'
    }
  },
  required: ['server_id', 'server_slug', 'team_id', 'old_process_id', 'new_process_id', 'restart_reason', 'attempt_number'],
  additionalProperties: true
} as const;
56+
57+
/** Typed view of the `mcp.server.restarted` event payload. */
type ServerRestartedData = {
  /** MCP server identifier (installation_id) */
  server_id: string;
  /** MCP server slug (installation_name) */
  server_slug: string;
  /** Team identifier */
  team_id: string;
  /** Previous operating system process ID */
  old_process_id: number;
  /** New operating system process ID after restart */
  new_process_id: number;
  /** Reason for the restart */
  restart_reason: 'crash' | 'health_check_failed';
  /** Restart attempt number (1-3) */
  attempt_number: number;
};
67+
68+
/**
 * Handler for the `mcp.server.restarted` event.
 *
 * This event is emitted when a satellite automatically restarts a crashed MCP server.
 * The matching satelliteProcesses row is updated: `status` becomes 'running',
 * `process_pid` takes `new_process_id` from the payload, `health_status` becomes 'healthy',
 * `started_at` takes the event timestamp, `error_message` is reset to null and
 * `updated_at` takes the current time.
 *
 * NOTE(review): the row is matched by `satelliteProcesses.id` against `server_id`,
 * which the schema describes as an installation_id — confirm these are the same key.
 * NOTE(review): `satelliteId` is not used, so the update is not scoped to the
 * reporting satellite — confirm this is intended.
 *
 * @param satelliteId - Identifier of the reporting satellite (unused here)
 * @param eventData - Event payload, read as ServerRestartedData without further checks in this function
 * @param db - Drizzle LibSQL database handle used for the update
 * @param eventTimestamp - Event time, stored as `started_at`
 */
export async function handle(
  satelliteId: string,
  eventData: Record<string, unknown>,
  db: LibSQLDatabase,
  eventTimestamp: Date
): Promise<void> {
  const { server_id: serverId, new_process_id: newPid } = eventData as unknown as ServerRestartedData;
  const targetProcess = eq(satelliteProcesses.id, serverId);
  const now = new Date();

  // The restarted process is running again under a new PID
  await db
    .update(satelliteProcesses)
    .set({
      status: 'running',
      process_pid: newPid,
      health_status: 'healthy',
      started_at: eventTimestamp,
      error_message: null, // Clear previous error
      updated_at: now
    })
    .where(targetProcess);
}

services/satellite/.env.example

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,3 +70,10 @@ NSJAIL_CPU_TIME_LIMIT_SECONDS=60
7070

7171
# Maximum number of processes per MCP server (default: 50)
7272
NSJAIL_MAX_PROCESSES=50
73+
74+
# Process Idle Timeout (stdio MCP servers only)
75+
# Idle stdio processes are automatically terminated after this duration to save resources
76+
# Processes are transparently respawned when API calls arrive (1-3s latency)
77+
# Set to 0 to disable idle timeout (processes never sleep)
78+
# Default: 180 seconds (3 minutes)
79+
MCP_PROCESS_IDLE_TIMEOUT_SECONDS=180

services/satellite/README.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ EVENT_FLUSH_TIMEOUT_MS=5000 # Graceful shutdown flush timeout in millise
110110
NSJAIL_MEMORY_LIMIT_MB=50 # Memory limit per MCP server process in MB (default: 50)
111111
NSJAIL_CPU_TIME_LIMIT_SECONDS=60 # CPU time limit per MCP server process in seconds (default: 60)
112112
NSJAIL_MAX_PROCESSES=50 # Maximum number of processes per MCP server (default: 50)
113+
114+
# Process Idle Timeout (stdio MCP servers only)
115+
MCP_PROCESS_IDLE_TIMEOUT_SECONDS=180 # Idle timeout in seconds before terminating stdio processes (default: 180, set to 0 to disable)
113116
```
114117

115118
### nsjail Resource Limits
@@ -140,6 +143,54 @@ These limits control resource allocation for MCP server processes running in nsj
140143
- Allows easier debugging on macOS, Windows, and Linux
141144
- Full isolation only active in production Linux deployments
142145

146+
### Process Idle Timeout (stdio MCP servers only)
147+
148+
**MCP_PROCESS_IDLE_TIMEOUT_SECONDS** (Default: 180)
149+
150+
Automatically terminates idle stdio MCP server processes to save resources. Dormant processes are transparently respawned when API calls arrive.
151+
152+
**How It Works:**
153+
- Background job checks all stdio processes every 30 seconds
154+
- Processes that have been idle longer than the threshold are gracefully terminated
155+
- Process configurations are stored in memory for automatic respawning
156+
- When an API call arrives for a dormant process, the process is respawned automatically (1-3s latency)
157+
158+
**Benefits:**
159+
- **Memory Savings**: ~50-100MB per dormant process
160+
- **CPU Savings**: Zero overhead for idle processes
161+
- **Transparent**: MCP clients unaware of sleep/wake cycle
162+
- **Team Isolation**: Maintained (separate processes per team)
163+
164+
**Configuration Options:**
165+
- `180` (default): 3 minutes idle timeout
166+
- `60`: 1 minute idle timeout (aggressive, for high-density deployments)
167+
- `600`: 10 minutes idle timeout (conservative, for frequently-used servers)
168+
- `0`: Disable idle timeout (processes never sleep)
169+
170+
**Edge Cases Handled:**
171+
- Only terminates processes with `status=running`
172+
- Skips processes with active requests in flight
173+
- Prevents concurrent respawn attempts
174+
- Dormant processes excluded from heartbeat reports
175+
176+
**Monitoring:**
177+
```bash
178+
# Check idle process activity
179+
grep "idle_process_check_completed" logs/satellite.log
180+
181+
# Track dormant transitions
182+
grep "process_marked_dormant" logs/satellite.log
183+
184+
# Monitor respawn operations
185+
grep "dormant_process_respawned" logs/satellite.log
186+
```
187+
188+
**When to Adjust:**
189+
- **High-density deployments**: Lower timeout (60-120s) to maximize resource savings
190+
- **Frequently-used servers**: Higher timeout (300-600s) to avoid unnecessary respawns
191+
- **Development/testing**: Disable (0) to avoid respawn delays during debugging
192+
- **Production with ample resources**: Keep default (180s) for balanced operation
193+
143194
### Required Environment Variables
144195

145196
**DEPLOYSTACK_SATELLITE_NAME** (Mandatory)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/**
2+
* Process management configuration
3+
*/
4+
5+
/**
 * Convert a duration in seconds, as read from an environment variable, to milliseconds.
 *
 * Falls back to `defaultSeconds` when the value is missing, empty, not numeric,
 * or negative, so a malformed environment value can never produce NaN or a
 * negative duration. `0` is a valid value and yields `0`.
 *
 * @param raw - Raw environment variable value (may be undefined)
 * @param defaultSeconds - Seconds to use when `raw` is unusable
 * @returns The duration in milliseconds
 */
function parseSecondsSetting(raw: string | undefined, defaultSeconds: number): number {
  const seconds = Number.parseInt(raw ?? '', 10);
  const usable = !Number.isNaN(seconds) && seconds >= 0;
  return (usable ? seconds : defaultSeconds) * 1000;
}

/**
 * Idle timeout for stdio MCP server processes
 * Processes inactive for longer than this will be terminated and marked dormant
 * They will be automatically respawned when needed
 *
 * Default: 180 seconds (3 minutes); also used when the variable is invalid or negative
 * Configure via: MCP_PROCESS_IDLE_TIMEOUT_SECONDS environment variable
 */
export const IDLE_TIMEOUT_MS = parseSecondsSetting(
  process.env.MCP_PROCESS_IDLE_TIMEOUT_SECONDS,
  180
);

/**
 * Grace period after process spawn during which it cannot be marked idle
 * This prevents newly spawned processes from being terminated before they finish initialization
 *
 * Default: 60 seconds; also used when the variable is invalid or negative
 * Configure via: MCP_PROCESS_SPAWN_GRACE_PERIOD_SECONDS environment variable
 */
export const SPAWN_GRACE_PERIOD_MS = parseSecondsSetting(
  process.env.MCP_PROCESS_SPAWN_GRACE_PERIOD_SECONDS,
  60
);

0 commit comments

Comments
 (0)