Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 31 additions & 20 deletions src/gateway/process.ts
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,17 @@ export async function findExistingGatewayProcess(sandbox: Sandbox): Promise<Proc
*
* @param sandbox - The sandbox instance
* @param env - Worker environment bindings
* @param options.waitForReady - If false, start the process but don't wait for port.
* Used by /api/status to avoid exceeding the Worker CPU limit. Default: true.
* @returns The running gateway process, or null if the gateway is up but we
* don't have a process handle (detected via port probe only)
*/
export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise<Process | null> {
export async function ensureGateway(
sandbox: Sandbox,
env: OpenClawEnv,
options?: { waitForReady?: boolean },
): Promise<Process | null> {
const waitForReady = options?.waitForReady !== false;
// Check if gateway is already running or starting
const existingProcess = await findExistingGatewayProcess(sandbox);
if (existingProcess) {
Expand Down Expand Up @@ -174,28 +181,32 @@ export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise
throw startErr;
}

// Wait for the gateway to be ready
try {
console.log('[Gateway] Waiting for OpenClaw gateway to be ready on port', GATEWAY_PORT);
await process.waitForPort(GATEWAY_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS });
console.log('[Gateway] OpenClaw gateway is ready!');

const logs = await process.getLogs();
if (logs.stdout) console.log('[Gateway] stdout:', logs.stdout);
if (logs.stderr) console.log('[Gateway] stderr:', logs.stderr);
} catch (e) {
console.error('[Gateway] waitForPort failed:', e);
if (waitForReady) {
// Wait for the gateway to be ready
try {
console.log('[Gateway] Waiting for OpenClaw gateway to be ready on port', GATEWAY_PORT);
await process.waitForPort(GATEWAY_PORT, { mode: 'tcp', timeout: STARTUP_TIMEOUT_MS });
console.log('[Gateway] OpenClaw gateway is ready!');

const logs = await process.getLogs();
console.error('[Gateway] startup failed. Stderr:', logs.stderr);
console.error('[Gateway] startup failed. Stdout:', logs.stdout);
throw new Error(`OpenClaw gateway failed to start. Stderr: ${logs.stderr || '(empty)'}`, {
cause: e,
});
} catch (logErr) {
console.error('[Gateway] Failed to get logs:', logErr);
throw e;
if (logs.stdout) console.log('[Gateway] stdout:', logs.stdout);
if (logs.stderr) console.log('[Gateway] stderr:', logs.stderr);
} catch (e) {
console.error('[Gateway] waitForPort failed:', e);
try {
const logs = await process.getLogs();
console.error('[Gateway] startup failed. Stderr:', logs.stderr);
console.error('[Gateway] startup failed. Stdout:', logs.stdout);
throw new Error(`OpenClaw gateway failed to start. Stderr: ${logs.stderr || '(empty)'}`, {
cause: e,
});
} catch (logErr) {
console.error('[Gateway] Failed to get logs:', logErr);
throw e;
}
}
} else {
console.log('[Gateway] Process started (not waiting for ready):', process.id);
}

// Verify gateway is actually responding
Expand Down
31 changes: 24 additions & 7 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ app.use('*', async (c, next) => {
return next();
}

// Skip validation in dev mode
if (c.env.DEV_MODE === 'true') {
// Skip validation in dev/test mode
if (c.env.DEV_MODE === 'true' || c.env.E2E_TEST_MODE === 'true') {
return next();
}

Expand Down Expand Up @@ -262,10 +262,27 @@ app.all('*', async (c) => {
const isWebSocketRequest = request.headers.get('Upgrade')?.toLowerCase() === 'websocket';
const acceptsHtml = request.headers.get('Accept')?.includes('text/html');

// For browser HTML requests, always try the proxy first but with a fallback
// to the loading page. This avoids calling listProcesses() which can hang
// on cold start (the DO RPC takes 30-60s and kills the Worker via CPU limit).
// The loading page polls /api/status which handles restore + gateway start.
// For browser HTML requests, check whether the gateway is running before proxying.
// If it is not running, serve the loading page immediately. The loading page polls
// /api/status, which handles restore + gateway start. We race
// findExistingGatewayProcess against a short (3s) timeout so that a slow lookup
// cannot stall the request — if it doesn't respond in time, we assume the gateway isn't ready.
if (!isWebSocketRequest && acceptsHtml) {
let gatewayReady = false;
try {
const proc = await Promise.race([
findExistingGatewayProcess(sandbox),
new Promise<null>((resolve) => setTimeout(() => resolve(null), 3_000)),
]);
gatewayReady = proc !== null && proc.status === 'running';
} catch {
// Treat as not ready
}
if (!gatewayReady) {
console.log('[PROXY] Gateway not ready for HTML request, serving loading page');
return c.html(loadingPageHtml);
}
}

// For non-WebSocket, non-HTML requests (API calls, static assets), we need
// the gateway to be running. Restore first, then start.
Expand Down Expand Up @@ -497,7 +514,7 @@ app.all('*', async (c) => {
console.log('[HTTP] Response status:', httpResponse.status);

// For HTML requests, verify we got actual content from the gateway.
// containerFetch can return a 200 with empty body if the gateway's
// containerFetch can return a 200 with empty/streaming body if the gateway's
// HTTP handler hasn't fully initialized. Show the loading page instead
// of a blank page that the user would be stuck on forever.
if (acceptsHtml) {
Expand Down
24 changes: 10 additions & 14 deletions src/routes/public.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,23 +50,19 @@ publicRoutes.get('/api/status', async (c) => {
console.error('[api/status] Restore failed:', restoreError);
}

// Start the gateway synchronously with a short timeout. Workers have a
// 30s CPU limit — restoreIfNeeded uses ~1-3s, leaving ~25s for the
// gateway. If it doesn't start in time, the loading page retries.
// We use synchronous start instead of waitUntil because waitUntil is
// unreliable in the Durable Object context.
// Start the gateway but DON'T wait for it to be ready.
// ensureGateway with waitForReady:false just starts the process
// (fast RPC, ~2-5s) without blocking on waitForPort (which takes
// up to 180s and would exceed the 30s Worker CPU limit).
// The loading page polls every 2s — subsequent polls will find
// the process and check if the port is up.
console.log('[api/status] No process found, starting gateway...');
try {
await Promise.race([
ensureGateway(sandbox, c.env),
new Promise((_, reject) => setTimeout(() => reject(new Error('timeout')), 25_000)),
]);
process = await findExistingGatewayProcess(sandbox);
if (process) {
return c.json({ ok: true, status: 'running', processId: process.id });
}
await ensureGateway(sandbox, c.env, { waitForReady: false });
} catch (err) {
console.log('[api/status] Gateway start timed out or failed, will retry on next poll');
const msg = err instanceof Error ? err.message : String(err);
console.error('[api/status] Gateway start failed:', msg);
return c.json({ ok: false, status: 'start_failed', error: msg, restoreError });
}
return c.json({ ok: false, status: 'starting', restoreError });
}
Expand Down
14 changes: 10 additions & 4 deletions test/e2e/zzz_cron_wake.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,16 @@ verify container is down
%require
===
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
# Check via debug/processes (does NOT trigger gateway restart like /api/status does)
sleep 3
PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}')
COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0")
# Poll up to 10 times, 3s apart, until no processes are running (destroy may take a few seconds)
for i in $(seq 1 10); do
PROCS=$(./curl-auth -s "$WORKER_URL/debug/processes" 2>/dev/null || echo '{"processes":[]}')
COUNT=$(echo "$PROCS" | jq '[.processes[] | select(.status == "running")] | length' 2>/dev/null || echo "0")
if [ "$COUNT" = "0" ]; then
echo "{\"running_processes\": $COUNT}" | jq .
exit 0
fi
sleep 3
done
echo "{\"running_processes\": $COUNT}" | jq .
---
{{ result: json object }}
Expand Down
Loading