diff --git a/ORPHAN_PROCESS_CLEANUP_IMPLEMENTATION.md b/ORPHAN_PROCESS_CLEANUP_IMPLEMENTATION.md new file mode 100644 index 0000000000..7bf862f0bc --- /dev/null +++ b/ORPHAN_PROCESS_CLEANUP_IMPLEMENTATION.md @@ -0,0 +1,182 @@ +# Orphan Process Cleanup Implementation + +## Overview + +This document describes the implementation of a sophisticated orphan process cleanup mechanism for the Deepnote server starter that prevents terminating active servers from other VS Code windows. + +## Problem Statement + +Previously, the cleanup logic in `cleanupOrphanedProcesses()` would force-kill **every** process matching "deepnote_toolkit server", which could terminate active servers from other VS Code windows, causing disruption to users working in multiple windows. + +## Solution + +The new implementation uses a lock file system combined with parent process verification to only kill genuine orphan processes. + +## Key Components + +### 1. Session Management + +- **Session ID**: Each VS Code window instance generates a unique session ID using `generateUuid()` +- **Lock File Directory**: Lock files are stored in `${os.tmpdir()}/vscode-deepnote-locks/` +- **Lock File Format**: JSON files named `server-{pid}.json` containing: + ```typescript + interface ServerLockFile { + sessionId: string; // Unique ID for the VS Code window + pid: number; // Process ID of the server + timestamp: number; // When the server was started + } + ``` + +### 2. Lock File Lifecycle + +#### Creation + +- When a server starts successfully, a lock file is written with the server's PID and current session ID +- Location: `writeLockFile()` called in `startServerImpl()` after the server process is spawned + +#### Deletion + +- Lock files are deleted when: + 1. The server is explicitly stopped via `stopServerImpl()` + 2. The extension is disposed and all servers are shut down + 3. An orphaned process is successfully killed during cleanup + +### 3. Orphan Detection Logic + +The `isProcessOrphaned()` method checks if a process is truly orphaned by verifying its parent process: + +#### Unix/Linux/macOS + +```bash +# Get parent process ID +ps -o ppid= -p + +# Check if parent exists (using -o pid= to get only PID with no header) +ps -p -o pid= +``` + +- If PPID is 1 (init/systemd), the process is orphaned +- If parent process doesn't exist (empty stdout from `ps -o pid=`), the process is orphaned +- The `-o pid=` flag ensures no header is printed, so empty output reliably indicates a missing process + +#### Windows + +```cmd +# Get parent process ID +wmic process where ProcessId= get ParentProcessId + +# Check if parent exists +tasklist /FI "PID eq " /FO CSV /NH +``` + +- If parent process doesn't exist or PPID is 0, the process is orphaned + +### 4. Cleanup Decision Flow + +When `cleanupOrphanedProcesses()` runs (at extension startup): + +1. **Find all deepnote_toolkit server processes** + + - Use `ps aux` (Unix) or `tasklist` (Windows) + - Extract PIDs of matching processes + +2. **For each candidate PID:** + + a. **Check for lock file** + + - If lock file exists: + + - If session ID matches current session → **SKIP** (shouldn't happen at startup) + - If session ID differs: + - Check if process is orphaned + - If orphaned → **KILL** + - If not orphaned → **SKIP** (active in another window) + + - If no lock file exists: + - Check if process is orphaned + - If orphaned → **KILL** + - If not orphaned → **SKIP** (might be from older version without lock files) + +3. **Kill orphaned processes** + + - Use `kill -9` (Unix) or `taskkill /F /T` (Windows) + - Delete lock file after successful kill + +4. **Log all decisions** + - Processes to kill: logged with reason + - Processes to skip: logged with reason + - Provides full audit trail for debugging + +## Code Changes + +### Modified Files + +- `src/kernels/deepnote/deepnoteServerStarter.node.ts` + +### New Imports + +```typescript +import { IExtensionSyncActivationService } from '../../platform/activation/types'; +import * as fs from 'fs-extra'; +import * as os from 'os'; +import * as path from '../../platform/vscode-path/path'; +import { generateUuid } from '../../platform/common/uuid'; +``` + +### New Class Members + +```typescript +private readonly sessionId: string = generateUuid(); +private readonly lockFileDir: string = path.join(os.tmpdir(), 'vscode-deepnote-locks'); +``` + +### New Methods + +1. `initializeLockFileDirectory()` - Creates lock file directory +2. `getLockFilePath(pid)` - Returns path to lock file for a PID +3. `writeLockFile(pid)` - Creates lock file for a server process +4. `readLockFile(pid)` - Reads lock file data +5. `deleteLockFile(pid)` - Removes lock file +6. `isProcessOrphaned(pid)` - Checks if process is orphaned by verifying parent + +### Modified Methods + +1. `constructor()` - Minimal initialization (dependency injection only) +2. `activate()` - Initializes lock file directory and triggers cleanup (implements IExtensionSyncActivationService) +3. `startServerImpl()` - Writes lock file after server starts +4. `stopServerImpl()` - Deletes lock file when server stops +5. `dispose()` - Deletes lock files for all stopped servers +6. `cleanupOrphanedProcesses()` - Implements sophisticated orphan detection + +## Benefits + +1. **Multi-Window Safety**: Active servers in other VS Code windows are never killed +2. **Backward Compatible**: Handles processes from older versions without lock files +3. **Robust Orphan Detection**: Uses OS-level parent process verification +4. **Full Audit Trail**: Comprehensive logging of all cleanup decisions +5. **Automatic Cleanup**: Stale lock files are removed when processes are killed +6. **Session Isolation**: Each VS Code window operates independently + +## Testing Recommendations + +1. **Single Window**: Verify servers start and stop correctly +2. **Multiple Windows**: Open multiple VS Code windows with Deepnote files, verify servers in other windows aren't killed +3. **Orphan Cleanup**: Kill VS Code process forcefully, restart, verify orphaned servers are cleaned up +4. **Lock File Cleanup**: Verify lock files are created and deleted appropriately +5. **Cross-Platform**: Test on Windows, macOS, and Linux + +## Edge Cases Handled + +1. **No lock file + active parent**: Process is skipped (might be from older version) +2. **Lock file + different session + active parent**: Process is skipped (active in another window) +3. **Lock file + same session**: Process is skipped (shouldn't happen at startup) +4. **No lock file + orphaned**: Process is killed (genuine orphan) +5. **Lock file + different session + orphaned**: Process is killed (orphaned from crashed window) +6. **Failed parent check**: Process is assumed not orphaned (safer default) + +## Future Enhancements + +1. **Stale Lock File Cleanup**: Periodically clean up lock files for non-existent processes +2. **Lock File Expiry**: Add TTL to lock files to handle edge cases +3. **Health Monitoring**: Periodic checks to ensure servers are still responsive +4. **Graceful Shutdown**: Try SIGTERM before SIGKILL for orphaned processes diff --git a/src/kernels/deepnote/deepnoteServerStarter.node.ts b/src/kernels/deepnote/deepnoteServerStarter.node.ts index cf4ee4e34f..b47a74bbd0 100644 --- a/src/kernels/deepnote/deepnoteServerStarter.node.ts +++ b/src/kernels/deepnote/deepnoteServerStarter.node.ts @@ -7,29 +7,63 @@ import { PythonEnvironment } from '../../platform/pythonEnvironments/info'; import { IDeepnoteServerStarter, IDeepnoteToolkitInstaller, DeepnoteServerInfo, DEEPNOTE_DEFAULT_PORT } from './types'; import { IProcessServiceFactory, ObservableExecutionResult } from '../../platform/common/process/types.node'; import { logger } from '../../platform/logging'; -import { IOutputChannel, IDisposable, IHttpClient } from '../../platform/common/types'; +import { IOutputChannel, IDisposable, IHttpClient, IAsyncDisposableRegistry } from '../../platform/common/types'; import { STANDARD_OUTPUT_CHANNEL } from '../../platform/common/constants'; import { sleep } from '../../platform/common/utils/async'; import { Cancellation, raceCancellationError } from '../../platform/common/cancellation'; +import { IExtensionSyncActivationService } from '../../platform/activation/types'; import getPort from 'get-port'; +import * as fs from 'fs-extra'; +import * as os from 'os'; +import * as path from '../../platform/vscode-path/path'; +import { generateUuid } from '../../platform/common/uuid'; + +/** + * Lock file data structure for tracking server ownership + */ +interface ServerLockFile { + sessionId: string; + pid: number; + timestamp: number; +} /** * Starts and manages the deepnote-toolkit Jupyter server. */ @injectable() -export class DeepnoteServerStarter implements IDeepnoteServerStarter { +export class DeepnoteServerStarter implements IDeepnoteServerStarter, IExtensionSyncActivationService { private readonly serverProcesses: Map> = new Map(); private readonly serverInfos: Map = new Map(); private readonly disposablesByFile: Map = new Map(); // Track in-flight operations per file to prevent concurrent start/stop private readonly pendingOperations: Map> = new Map(); + // Unique session ID for this VS Code window instance + private readonly sessionId: string = generateUuid(); + // Directory for lock files + private readonly lockFileDir: string = path.join(os.tmpdir(), 'vscode-deepnote-locks'); constructor( @inject(IProcessServiceFactory) private readonly processServiceFactory: IProcessServiceFactory, @inject(IDeepnoteToolkitInstaller) private readonly toolkitInstaller: IDeepnoteToolkitInstaller, @inject(IOutputChannel) @named(STANDARD_OUTPUT_CHANNEL) private readonly outputChannel: IOutputChannel, - @inject(IHttpClient) private readonly httpClient: IHttpClient - ) {} + @inject(IHttpClient) private readonly httpClient: IHttpClient, + @inject(IAsyncDisposableRegistry) asyncRegistry: IAsyncDisposableRegistry + ) { + // Register for disposal when the extension deactivates + asyncRegistry.push(this); + } + + public activate(): void { + // Ensure lock file directory exists + this.initializeLockFileDirectory().catch((ex) => { + logger.warn(`Failed to initialize lock file directory: ${ex}`); + }); + + // Clean up any orphaned deepnote-toolkit processes from previous sessions + this.cleanupOrphanedProcesses().catch((ex) => { + logger.warn(`Failed to cleanup orphaned processes: ${ex}`); + }); + } public async getOrStartServer( interpreter: PythonEnvironment, @@ -152,6 +186,14 @@ export class DeepnoteServerStarter implements IDeepnoteServerStarter { const serverInfo = { url, port }; this.serverInfos.set(fileKey, serverInfo); + // Write lock file for the server process + const serverPid = serverProcess.proc?.pid; + if (serverPid) { + await this.writeLockFile(serverPid); + } else { + logger.warn(`Could not get PID for server process for ${fileKey}`); + } + try { const serverReady = await this.waitForServer(serverInfo, 120000, token); if (!serverReady) { @@ -203,12 +245,19 @@ export class DeepnoteServerStarter implements IDeepnoteServerStarter { const serverProcess = this.serverProcesses.get(fileKey); if (serverProcess) { + const serverPid = serverProcess.proc?.pid; + try { logger.info(`Stopping Deepnote server for ${fileKey}...`); serverProcess.proc?.kill(); this.serverProcesses.delete(fileKey); this.serverInfos.delete(fileKey); this.outputChannel.appendLine(`Deepnote server stopped for ${fileKey}`); + + // Clean up lock file after stopping the server + if (serverPid) { + await this.deleteLockFile(serverPid); + } } catch (ex) { logger.error(`Error stopping Deepnote server: ${ex}`); } @@ -247,19 +296,68 @@ export class DeepnoteServerStarter implements IDeepnoteServerStarter { } } - public dispose(): void { + public async dispose(): Promise { logger.info('Disposing DeepnoteServerStarter - stopping all servers...'); - // Stop all server processes + // Wait for any pending operations to complete (with timeout) + const pendingOps = Array.from(this.pendingOperations.values()); + if (pendingOps.length > 0) { + logger.info(`Waiting for ${pendingOps.length} pending operations to complete...`); + await Promise.allSettled(pendingOps.map((op) => Promise.race([op, sleep(2000)]))); + } + + // Stop all server processes and wait for them to exit + const killPromises: Promise[] = []; + const pidsToCleanup: number[] = []; + for (const [fileKey, serverProcess] of this.serverProcesses.entries()) { try { logger.info(`Stopping Deepnote server for ${fileKey}...`); - serverProcess.proc?.kill(); + const proc = serverProcess.proc; + if (proc && !proc.killed) { + const serverPid = proc.pid; + if (serverPid) { + pidsToCleanup.push(serverPid); + } + + // Create a promise that resolves when the process exits + const exitPromise = new Promise((resolve) => { + const timeout = setTimeout(() => { + logger.warn(`Process for ${fileKey} did not exit gracefully, force killing...`); + try { + proc.kill('SIGKILL'); + } catch { + // Ignore errors on force kill + } + resolve(); + }, 3000); // Wait up to 3 seconds for graceful exit + + proc.once('exit', () => { + clearTimeout(timeout); + resolve(); + }); + }); + + // Send SIGTERM for graceful shutdown + proc.kill('SIGTERM'); + killPromises.push(exitPromise); + } } catch (ex) { logger.error(`Error stopping Deepnote server for ${fileKey}: ${ex}`); } } + // Wait for all processes to exit + if (killPromises.length > 0) { + logger.info(`Waiting for ${killPromises.length} server processes to exit...`); + await Promise.allSettled(killPromises); + } + + // Clean up lock files for all stopped processes + for (const pid of pidsToCleanup) { + await this.deleteLockFile(pid); + } + // Dispose all tracked disposables for (const [fileKey, disposables] of this.disposablesByFile.entries()) { try { @@ -277,4 +375,296 @@ export class DeepnoteServerStarter implements IDeepnoteServerStarter { logger.info('DeepnoteServerStarter disposed successfully'); } + + /** + * Initialize the lock file directory + */ + private async initializeLockFileDirectory(): Promise { + try { + await fs.ensureDir(this.lockFileDir); + logger.info(`Lock file directory initialized at ${this.lockFileDir} with session ID ${this.sessionId}`); + } catch (ex) { + logger.error(`Failed to create lock file directory: ${ex}`); + } + } + + /** + * Get the lock file path for a given PID + */ + private getLockFilePath(pid: number): string { + return path.join(this.lockFileDir, `server-${pid}.json`); + } + + /** + * Write a lock file for a server process + */ + private async writeLockFile(pid: number): Promise { + try { + const lockData: ServerLockFile = { + sessionId: this.sessionId, + pid, + timestamp: Date.now() + }; + const lockFilePath = this.getLockFilePath(pid); + await fs.writeJson(lockFilePath, lockData, { spaces: 2 }); + logger.info(`Created lock file for PID ${pid} with session ID ${this.sessionId}`); + } catch (ex) { + logger.warn(`Failed to write lock file for PID ${pid}: ${ex}`); + } + } + + /** + * Read a lock file for a given PID + */ + private async readLockFile(pid: number): Promise { + try { + const lockFilePath = this.getLockFilePath(pid); + if (await fs.pathExists(lockFilePath)) { + return await fs.readJson(lockFilePath); + } + } catch (ex) { + logger.warn(`Failed to read lock file for PID ${pid}: ${ex}`); + } + return null; + } + + /** + * Delete a lock file for a given PID + */ + private async deleteLockFile(pid: number): Promise { + try { + const lockFilePath = this.getLockFilePath(pid); + if (await fs.pathExists(lockFilePath)) { + await fs.remove(lockFilePath); + logger.info(`Deleted lock file for PID ${pid}`); + } + } catch (ex) { + logger.warn(`Failed to delete lock file for PID ${pid}: ${ex}`); + } + } + + /** + * Check if a process is orphaned by verifying its parent process + */ + private async isProcessOrphaned(pid: number): Promise { + try { + const processService = await this.processServiceFactory.create(undefined); + + if (process.platform === 'win32') { + // Windows: use WMIC to get parent process ID + const result = await processService.exec( + 'wmic', + ['process', 'where', `ProcessId=${pid}`, 'get', 'ParentProcessId'], + { throwOnStdErr: false } + ); + + if (result.stdout) { + const lines = result.stdout + .split('\n') + .filter((line) => line.trim() && !line.includes('ParentProcessId')); + if (lines.length > 0) { + const ppid = parseInt(lines[0].trim(), 10); + if (!isNaN(ppid)) { + // PPID of 0 means orphaned + if (ppid === 0) { + return true; + } + + // Check if parent process exists + const parentCheck = await processService.exec( + 'tasklist', + ['/FI', `PID eq ${ppid}`, '/FO', 'CSV', '/NH'], + { throwOnStdErr: false } + ); + + // Normalize and check stdout + const stdout = (parentCheck.stdout || '').trim(); + + // Parent is missing if: + // 1. stdout is empty + // 2. stdout starts with "INFO:" (case-insensitive) + // 3. stdout contains "no tasks are running" (case-insensitive) + if (stdout.length === 0 || /^INFO:/i.test(stdout) || /no tasks are running/i.test(stdout)) { + return true; // Parent missing, process is orphaned + } + + // Parent exists + return false; + } + } + } + } else { + // Unix: use ps to get parent process ID + const result = await processService.exec('ps', ['-o', 'ppid=', '-p', pid.toString()], { + throwOnStdErr: false + }); + + if (result.stdout) { + const ppid = parseInt(result.stdout.trim(), 10); + if (!isNaN(ppid)) { + // PPID of 1 typically means orphaned (adopted by init/systemd) + if (ppid === 1) { + return true; + } + // Check if parent process exists + const parentCheck = await processService.exec('ps', ['-p', ppid.toString(), '-o', 'pid='], { + throwOnStdErr: false + }); + return parentCheck.stdout.trim().length === 0; + } + } + } + } catch (ex) { + logger.warn(`Failed to check if process ${pid} is orphaned: ${ex}`); + } + + // If we can't determine, assume it's not orphaned (safer) + return false; + } + + /** + * Cleans up any orphaned deepnote-toolkit processes from previous VS Code sessions. + * This prevents port conflicts when starting new servers. + */ + private async cleanupOrphanedProcesses(): Promise { + try { + logger.info('Checking for orphaned deepnote-toolkit processes...'); + const processService = await this.processServiceFactory.create(undefined); + + // Find all deepnote-toolkit server processes + let command: string; + let args: string[]; + + if (process.platform === 'win32') { + // Windows: use tasklist and findstr + command = 'tasklist'; + args = ['/FI', 'IMAGENAME eq python.exe', '/FO', 'CSV', '/NH']; + } else { + // Unix-like: use ps and grep + command = 'ps'; + args = ['aux']; + } + + const result = await processService.exec(command, args, { throwOnStdErr: false }); + + if (result.stdout) { + const lines = result.stdout.split('\n'); + const candidatePids: number[] = []; + + for (const line of lines) { + // Look for processes running deepnote_toolkit server + if (line.includes('deepnote_toolkit') && line.includes('server')) { + // Extract PID based on platform + let pid: number | undefined; + + if (process.platform === 'win32') { + // Windows CSV format: "python.exe","12345",... + const match = line.match(/"python\.exe","(\d+)"/); + if (match) { + pid = parseInt(match[1], 10); + } + } else { + // Unix format: user PID ... + const parts = line.trim().split(/\s+/); + if (parts.length > 1) { + pid = parseInt(parts[1], 10); + } + } + + if (pid && !isNaN(pid)) { + candidatePids.push(pid); + } + } + } + + if (candidatePids.length > 0) { + logger.info( + `Found ${candidatePids.length} deepnote-toolkit server process(es): ${candidatePids.join(', ')}` + ); + + const pidsToKill: number[] = []; + const pidsToSkip: Array<{ pid: number; reason: string }> = []; + + // Check each process to determine if it should be killed + for (const pid of candidatePids) { + // Check if there's a lock file for this PID + const lockData = await this.readLockFile(pid); + + if (lockData) { + // Lock file exists - check if it belongs to a different session + if (lockData.sessionId !== this.sessionId) { + // Different session - check if the process is actually orphaned + const isOrphaned = await this.isProcessOrphaned(pid); + if (isOrphaned) { + logger.info( + `PID ${pid} belongs to session ${lockData.sessionId} and is orphaned - will kill` + ); + pidsToKill.push(pid); + } else { + pidsToSkip.push({ + pid, + reason: `belongs to active session ${lockData.sessionId.substring(0, 8)}...` + }); + } + } else { + // Same session - this shouldn't happen during startup, but skip it + pidsToSkip.push({ pid, reason: 'belongs to current session' }); + } + } else { + // No lock file - check if orphaned before killing + const isOrphaned = await this.isProcessOrphaned(pid); + if (isOrphaned) { + logger.info(`PID ${pid} has no lock file and is orphaned - will kill`); + pidsToKill.push(pid); + } else { + pidsToSkip.push({ pid, reason: 'no lock file but has active parent process' }); + } + } + } + + // Log skipped processes + if (pidsToSkip.length > 0) { + for (const { pid, reason } of pidsToSkip) { + logger.info(`Skipping PID ${pid}: ${reason}`); + } + } + + // Kill orphaned processes + if (pidsToKill.length > 0) { + logger.info(`Killing ${pidsToKill.length} orphaned process(es): ${pidsToKill.join(', ')}`); + this.outputChannel.appendLine( + `Cleaning up ${pidsToKill.length} orphaned deepnote-toolkit process(es)...` + ); + + for (const pid of pidsToKill) { + try { + if (process.platform === 'win32') { + await processService.exec('taskkill', ['/F', '/T', '/PID', pid.toString()], { + throwOnStdErr: false + }); + } else { + await processService.exec('kill', ['-9', pid.toString()], { throwOnStdErr: false }); + } + logger.info(`Killed orphaned process ${pid}`); + + // Clean up the lock file after killing + await this.deleteLockFile(pid); + } catch (ex) { + logger.warn(`Failed to kill process ${pid}: ${ex}`); + } + } + + this.outputChannel.appendLine('✓ Cleanup complete'); + } else { + logger.info('No orphaned deepnote-toolkit processes found (all processes are active)'); + } + } else { + logger.info('No deepnote-toolkit server processes found'); + } + } + } catch (ex) { + // Don't fail startup if cleanup fails + logger.warn(`Error during orphaned process cleanup: ${ex}`); + } + } } diff --git a/src/kernels/deepnote/types.ts b/src/kernels/deepnote/types.ts index 2d74fdad57..3114669b90 100644 --- a/src/kernels/deepnote/types.ts +++ b/src/kernels/deepnote/types.ts @@ -88,7 +88,7 @@ export interface IDeepnoteToolkitInstaller { } export const IDeepnoteServerStarter = Symbol('IDeepnoteServerStarter'); -export interface IDeepnoteServerStarter extends vscode.Disposable { +export interface IDeepnoteServerStarter { /** * Starts or gets an existing deepnote-toolkit Jupyter server. * @param interpreter The Python interpreter to use @@ -107,6 +107,12 @@ export interface IDeepnoteServerStarter extends vscode.Disposable { * @param deepnoteFileUri The URI of the .deepnote file */ stopServer(deepnoteFileUri: vscode.Uri): Promise; + + /** + * Disposes all server processes and resources. + * Called when the extension is deactivated. + */ + dispose(): Promise; } export interface DeepnoteServerInfo { diff --git a/src/notebooks/serviceRegistry.node.ts b/src/notebooks/serviceRegistry.node.ts index d0e20c2491..391aaf4679 100644 --- a/src/notebooks/serviceRegistry.node.ts +++ b/src/notebooks/serviceRegistry.node.ts @@ -130,6 +130,7 @@ export function registerTypes(serviceManager: IServiceManager, isDevMode: boolea // Deepnote kernel services serviceManager.addSingleton(IDeepnoteToolkitInstaller, DeepnoteToolkitInstaller); serviceManager.addSingleton(IDeepnoteServerStarter, DeepnoteServerStarter); + serviceManager.addBinding(IDeepnoteServerStarter, IExtensionSyncActivationService); serviceManager.addSingleton(IDeepnoteServerProvider, DeepnoteServerProvider); serviceManager.addBinding(IDeepnoteServerProvider, IExtensionSyncActivationService); serviceManager.addSingleton(IDeepnoteKernelAutoSelector, DeepnoteKernelAutoSelector);