diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cff5ed7c..2f8547379 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Fixes +- On Windows, CodeGraph's background processes no longer pile up without bound and saturate CPU over a long session. When the editor or agent that launched CodeGraph exited, its helper process couldn't tell its parent had gone — Windows reports process lineage differently than macOS and Linux — so the helper kept running, the shared background server never saw the client disconnect, and its idle timer never fired to shut it down. CodeGraph now detects parent-process exit directly on Windows, so helpers and the idle background server wind down promptly, the same as they already did on macOS and Linux. (#692, #576, #680) - React Native native→JS events now connect through the common `sendEvent(context, "X", body)` wrapper. Many libraries (react-native-device-info and others) wrap the event emitter behind a helper whose `.emit(eventName, …)` takes a *variable*, so the matcher — which looked for `.emit("literal", …)` — missed it; the literal event name actually lives in the wrapper call. Now a native method that fires `sendEvent(…, "batteryLevelChanged", …)` links to the JS `addListener('batteryLevelChanged', …)` handler, so editing the native emitter surfaces the JS subscriber. (React Native) - React Native / Expo cross-language bridges are more complete and more precise. An Expo Module method declared with a generic type — Android's `AsyncFunction("getBatteryLevelAsync")` — is now indexed (the `` used to defeat the matcher, so every Android Expo method was dropped and a JS call resolved only to the iOS Swift impl). The iOS and Android implementations of the same JS-visible method — both Expo Modules and classic NativeModules (`@ReactMethod` on Android, the matching method on iOS) — are now linked to each other, so a JS call that resolves to one platform still reaches the other and editing either platform's native code surfaces the JS caller. And a `Type.member` static read in native code (e.g. Android's `BatteryManager.EXTRA_LEVEL`) no longer falsely links to a coincidentally same-named class in another language (a web `BatteryManager`) — type references stay within a language family, while genuine cross-language bridges (config→code, JS↔native calls) are unaffected. (React Native, Expo) - A TypeScript/JavaScript reference or import no longer gets mis-linked to a same-named class in a native language. In a React Native / Expo repo that has both a TypeScript `TestRunner` type and a Kotlin `TestRunner` class, a TS reference to `TestRunner` — or an `import React` sitting next to a Swift `React` — used to resolve onto the native symbol (the component resolver matched any same-named class regardless of language, and import statements weren't language-checked at all). References and imports now stay within their language family, so they land on the right symbol while genuine cross-language bridges (JS↔native calls, config→code) are untouched. A C/C++ `#include "Foo.h"` likewise no longer resolves to a same-named header from another platform (an iOS Objective-C `Foo.h`). (React Native, Expo, TypeScript, C/C++) diff --git a/__tests__/ppid-watchdog.test.ts b/__tests__/ppid-watchdog.test.ts new file mode 100644 index 000000000..ba270fa7d --- /dev/null +++ b/__tests__/ppid-watchdog.test.ts @@ -0,0 +1,138 @@ +/** + * Unit coverage for the PPID-watchdog decision logic (#277, #692). + * + * The live watchdog timers in `proxy.ts` / `index.ts` are integration-tested on + * POSIX in `mcp-ppid-watchdog.test.ts`, but that test is skipped on Windows + * (`process.kill(pid, 'SIGKILL')` and reparenting are POSIX-specific). That gap + * is exactly how the Windows leak (#692) shipped: on Windows `process.ppid` + * never changes when the parent dies, so the old change-only check could never + * fire. These pure-function tests exercise the Windows branch on any OS by + * stubbing `isAlive` and `platform`. + */ +import { describe, it, expect } from 'vitest'; +import { supervisionLostReason } from '../src/mcp/ppid-watchdog'; + +const alive = () => true; +const dead = () => false; +/** Alive for everyone except the listed pids. */ +const deadOnly = (...pids: number[]) => (pid: number) => !pids.includes(pid); + +describe('supervisionLostReason', () => { + describe('POSIX (parent death reparents → ppid changes)', () => { + it('returns null while the parent is unchanged', () => { + expect( + supervisionLostReason({ + originalPpid: 100, + currentPpid: 100, + hostPpid: null, + isAlive: alive, + platform: 'linux', + }), + ).toBeNull(); + }); + + it('detects a reparent (ppid divergence) as the death signal', () => { + const reason = supervisionLostReason({ + originalPpid: 100, + currentPpid: 1, // reparented to init + hostPpid: null, + isAlive: alive, + platform: 'linux', + }); + expect(reason).toBe('ppid 100 -> 1'); + }); + + it('does NOT use liveness on POSIX — a dead original ppid is not orphaning', () => { + // A double-forked grandparent can die while we stay correctly parented. + // POSIX must rely on the change-check only, or it would false-positive. + expect( + supervisionLostReason({ + originalPpid: 100, + currentPpid: 100, + hostPpid: null, + isAlive: dead, + platform: 'linux', + }), + ).toBeNull(); + }); + }); + + describe('Windows (ppid is stable across parent death → poll liveness)', () => { + it('returns null while the original parent is still alive', () => { + expect( + supervisionLostReason({ + originalPpid: 100, + currentPpid: 100, + hostPpid: null, + isAlive: alive, + platform: 'win32', + }), + ).toBeNull(); + }); + + it('detects parent death by liveness even though ppid is unchanged (the #692 fix)', () => { + const reason = supervisionLostReason({ + originalPpid: 100, + currentPpid: 100, // Windows never reparents + hostPpid: null, + isAlive: deadOnly(100), + platform: 'win32', + }); + expect(reason).toBe('parent pid 100 exited'); + }); + + it('ignores pid 0/1 — never a real Windows parent, must not trigger shutdown', () => { + for (const ppid of [0, 1]) { + expect( + supervisionLostReason({ + originalPpid: ppid, + currentPpid: ppid, + hostPpid: null, + isAlive: dead, + platform: 'win32', + }), + ).toBeNull(); + } + }); + }); + + describe('threaded host pid (reached past an intermediate launcher shim)', () => { + it('shuts down when the host pid is gone, on either platform', () => { + for (const platform of ['linux', 'win32'] as const) { + const reason = supervisionLostReason({ + originalPpid: 100, + currentPpid: 100, + hostPpid: 42, + isAlive: deadOnly(42), // shim 100 alive, host 42 dead + platform, + }); + expect(reason).toBe('host pid 42 exited'); + } + }); + + it('stays supervised while the host pid is alive', () => { + expect( + supervisionLostReason({ + originalPpid: 100, + currentPpid: 100, + hostPpid: 42, + isAlive: alive, + platform: 'linux', + }), + ).toBeNull(); + }); + }); + + describe('signal precedence', () => { + it('reports the ppid change ahead of a host-gone reason', () => { + const reason = supervisionLostReason({ + originalPpid: 100, + currentPpid: 1, + hostPpid: 42, + isAlive: dead, + platform: 'linux', + }); + expect(reason).toBe('ppid 100 -> 1'); + }); + }); +}); diff --git a/src/mcp/index.ts b/src/mcp/index.ts index 660687491..fa939dfbb 100644 --- a/src/mcp/index.ts +++ b/src/mcp/index.ts @@ -49,6 +49,7 @@ import { } from './daemon'; import { connectWithHello, runLocalHandshakeProxy } from './proxy'; import { getDaemonSocketPath } from './daemon-paths'; +import { supervisionLostReason } from './ppid-watchdog'; import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags'; /** @@ -423,13 +424,13 @@ export class MCPServer { const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS); if (pollMs <= 0) return; this.ppidWatchdog = setInterval(() => { - const current = process.ppid; - const ppidChanged = current !== this.originalPpid; - const hostGone = this.hostPpid !== null && !isProcessAlive(this.hostPpid); - if (ppidChanged || hostGone) { - const reason = ppidChanged - ? `ppid ${this.originalPpid} -> ${current}` - : `host pid ${this.hostPpid} exited`; + const reason = supervisionLostReason({ + originalPpid: this.originalPpid, + currentPpid: process.ppid, + hostPpid: this.hostPpid, + isAlive: isProcessAlive, + }); + if (reason) { process.stderr.write( `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n` ); diff --git a/src/mcp/ppid-watchdog.ts b/src/mcp/ppid-watchdog.ts new file mode 100644 index 000000000..eaf4d250b --- /dev/null +++ b/src/mcp/ppid-watchdog.ts @@ -0,0 +1,63 @@ +/** + * Shared decision logic for the PPID watchdog (#277, #692). + * + * The watchdog's job: notice that the process we depend on — our parent, or the + * MCP host reached past an intermediate launcher — has died, so an orphaned + * proxy / direct server shuts itself down instead of leaking forever. + * + * Parent death surfaces differently per OS, and getting this wrong is what + * caused the unbounded daemon/proxy leak on Windows (#692, #576): + * + * - **POSIX** reparents an orphan to init (pid 1), so `process.ppid` *changes* + * the instant the parent dies. That divergence is the classic #277 signal. + * - **Windows** never reparents: `process.ppid` keeps reporting the original + * (now-dead) parent forever, so the change-check can never fire. There we + * must poll the original parent's *liveness* instead. + * + * The liveness fallback is deliberately gated to Windows. On POSIX a + * double-forked grandparent can legitimately outlive the reparent, so a dead + * `originalPpid` is not proof of orphaning there — the change-check is the + * correct and sufficient POSIX signal, and using liveness too would risk a + * false-positive shutdown. + */ +export interface SupervisionState { + /** `process.ppid` captured at startup. */ + originalPpid: number; + /** `process.ppid` right now. */ + currentPpid: number; + /** + * The MCP host pid threaded past an intermediate launcher + * (`CODEGRAPH_HOST_PPID`), or null when unknown — e.g. the standalone bundle, + * which pre-bakes `--liftoff-only` and so never runs the relaunch that sets it. + */ + hostPpid: number | null; + /** Liveness probe — `process.kill(pid, 0)` in production, stubbed in tests. */ + isAlive: (pid: number) => boolean; + /** Defaults to `process.platform`. */ + platform?: NodeJS.Platform; +} + +/** + * Returns a human-readable reason string when the process has lost its + * supervisor and should shut down, or null while it is still supervised. + */ +export function supervisionLostReason(state: SupervisionState): string | null { + const { originalPpid, currentPpid, hostPpid, isAlive } = state; + const platform = state.platform ?? process.platform; + + // POSIX: the parent dying reparents us, so ppid diverges. (Never on Windows.) + if (currentPpid !== originalPpid) { + return `ppid ${originalPpid} -> ${currentPpid}`; + } + // Windows: ppid is stable across parent death, so detect it by liveness. + // Skip pid 0/1 — "unknown" and init are never a real Windows parent, and a + // bogus liveness probe there must not trigger a shutdown. + if (platform === 'win32' && originalPpid > 1 && !isAlive(originalPpid)) { + return `parent pid ${originalPpid} exited`; + } + // Either platform: the host pid threaded past a launcher shim is gone. + if (hostPpid !== null && !isAlive(hostPpid)) { + return `host pid ${hostPpid} exited`; + } + return null; +} diff --git a/src/mcp/proxy.ts b/src/mcp/proxy.ts index ceea34a67..8517f50d7 100644 --- a/src/mcp/proxy.ts +++ b/src/mcp/proxy.ts @@ -22,6 +22,7 @@ import * as fs from 'fs'; import * as net from 'net'; import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags'; import { DaemonHello, MAX_HELLO_LINE_BYTES } from './daemon'; +import { supervisionLostReason } from './ppid-watchdog'; import { CodeGraphPackageVersion } from './version'; import { SERVER_INFO, PROTOCOL_VERSION } from './session'; import { SERVER_INSTRUCTIONS } from './server-instructions'; @@ -292,8 +293,14 @@ function startPpidWatchdogNoSocket(onDeath: () => void): void { const originalPpid = process.ppid; const hostPpid = parseHostPpid(process.env[HOST_PPID_ENV]); const timer = setInterval(() => { - if (process.ppid !== originalPpid || (hostPpid !== null && !isProcessAliveLocal(hostPpid))) { - process.stderr.write('[CodeGraph MCP] Parent process exited; shutting down.\n'); + const reason = supervisionLostReason({ + originalPpid, + currentPpid: process.ppid, + hostPpid, + isAlive: isProcessAliveLocal, + }); + if (reason) { + process.stderr.write(`[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`); onDeath(); } }, pollMs); @@ -408,13 +415,13 @@ function startPpidWatchdog(socket: net.Socket): void { const originalPpid = process.ppid; const hostPpid = parseHostPpid(process.env[HOST_PPID_ENV]); const timer = setInterval(() => { - const current = process.ppid; - const ppidChanged = current !== originalPpid; - const hostGone = hostPpid !== null && !isProcessAliveLocal(hostPpid); - if (ppidChanged || hostGone) { - const reason = ppidChanged - ? `ppid ${originalPpid} -> ${current}` - : `host pid ${hostPpid} exited`; + const reason = supervisionLostReason({ + originalPpid, + currentPpid: process.ppid, + hostPpid, + isAlive: isProcessAliveLocal, + }); + if (reason) { process.stderr.write(`[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`); try { socket.destroy(); } catch { /* ignore */ } process.exit(0);