From 50af98e2b14e723b2b18a68156bd78126927e74b Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 17:59:24 +0100 Subject: [PATCH 1/7] Add execution tracking for Jupyter cell executions - Track execution start/end with timestamps - Log execution duration and success status - Publish execution metadata to webapp - Register IPython event handlers for monitoring --- deepnote_toolkit/execution_tracking.py | 132 +++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 deepnote_toolkit/execution_tracking.py diff --git a/deepnote_toolkit/execution_tracking.py b/deepnote_toolkit/execution_tracking.py new file mode 100644 index 0000000..3afe4e5 --- /dev/null +++ b/deepnote_toolkit/execution_tracking.py @@ -0,0 +1,132 @@ +""" +This module provides execution tracking for Jupyter notebook cells. +It monitors execution start/end events, duration, and publishes metadata to help +debug stuck executions and improve observability. +""" + +import time +from typing import Any, Dict, Optional + +from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult + +from .ipython_utils import publish_execution_metadata +from .logging import LoggerManager + + +class ExecutionTracker: + """Tracks execution state of notebook cells.""" + + def __init__(self): + self.logger = LoggerManager.get_logger("execution_tracker") + self.current_execution: Optional[Dict[str, Any]] = None + self.execution_count = 0 + + def on_pre_execute(self, info: ExecutionInfo) -> None: + """ + Called before executing a cell. + Logs execution start and tracks execution metadata. + """ + self.execution_count += 1 + cell_preview = info.raw_cell[:100] if info.raw_cell else "" + cell_id = hash(info.raw_cell) if info.raw_cell else 0 + + self.current_execution = { + "cell_id": cell_id, + "code_preview": cell_preview, + "start_time": time.time(), + "execution_count": self.execution_count, + } + + self.logger.info( + "EXEC_START | count=%d | cell_id=%s | preview=%s", + self.execution_count, + cell_id, + cell_preview[:50].replace("\n", "\\n"), + ) + + def on_post_execute(self, result: ExecutionResult) -> None: + """ + Called after executing a cell. + Logs execution end, duration, and success status. + Publishes execution metadata for webapp consumption. + """ + if not self.current_execution: + self.logger.warning("EXEC_END called without matching EXEC_START") + return + + duration = time.time() - self.current_execution["start_time"] + success = result.error_in_exec is None + error_name = ( + type(result.error_in_exec).__name__ if result.error_in_exec else None + ) + + self.logger.info( + "EXEC_END | count=%d | duration=%.2fs | success=%s%s", + self.current_execution["execution_count"], + duration, + success, + f" | error={error_name}" if error_name else "", + ) + + # Publish metadata to webapp + try: + publish_execution_metadata( + execution_count=self.current_execution["execution_count"], + duration=duration, + success=success, + error_type=error_name, + ) + except Exception as e: # pylint: disable=broad-exception-caught + self.logger.error("Failed to publish execution metadata: %s", e) + + self.current_execution = None + + def on_pre_run_cell(self, info: ExecutionInfo) -> None: + """Called before running a cell (before pre_execute).""" + cell_preview = info.raw_cell[:30] if info.raw_cell else "" + self.logger.debug( + "PRE_RUN | preview=%s", cell_preview.replace("\n", "\\n") + ) + + def on_post_run_cell(self, result: ExecutionResult) -> None: + """Called after running a cell (after post_execute).""" + self.logger.debug("POST_RUN | exec_count=%s", result.execution_count) + + +# Global instance +_execution_tracker: Optional[ExecutionTracker] = None + + +def setup_execution_tracking() -> None: + """ + Set up IPython event handlers for execution tracking. + This should be called during runtime initialization. + """ + global _execution_tracker # pylint: disable=global-statement + + try: + from IPython import get_ipython + + ip = get_ipython() + if ip is None: + LoggerManager.get_logger("execution_tracking").warning( + "IPython instance not available, skipping execution tracking setup" + ) + return + + _execution_tracker = ExecutionTracker() + + # Register event handlers + ip.events.register("pre_execute", _execution_tracker.on_pre_execute) + ip.events.register("post_execute", _execution_tracker.on_post_execute) + ip.events.register("pre_run_cell", _execution_tracker.on_pre_run_cell) + ip.events.register("post_run_cell", _execution_tracker.on_post_run_cell) + + LoggerManager.get_logger("execution_tracking").info( + "Execution tracking initialized successfully" + ) + + except Exception as e: # pylint: disable=broad-exception-caught + LoggerManager.get_logger("execution_tracking").error( + "Failed to set up execution tracking: %s", e + ) From c1d4ccae9f6855ee56877bebf94291a384715b56 Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 17:59:34 +0100 Subject: [PATCH 2/7] Add optional execution timeout monitoring - Monitor long-running executions with configurable thresholds - Send warnings when executions exceed warning threshold - Optional auto-interrupt for stuck executions via SIGINT - Report warnings/timeouts to webapp - Configurable via environment variables: - DEEPNOTE_ENABLE_EXECUTION_TIMEOUT - DEEPNOTE_EXECUTION_WARNING_THRESHOLD (default: 240s) - DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD (default: 300s) - DEEPNOTE_EXECUTION_AUTO_INTERRUPT (default: false) --- deepnote_toolkit/execution_timeout.py | 237 ++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 deepnote_toolkit/execution_timeout.py diff --git a/deepnote_toolkit/execution_timeout.py b/deepnote_toolkit/execution_timeout.py new file mode 100644 index 0000000..19945c1 --- /dev/null +++ b/deepnote_toolkit/execution_timeout.py @@ -0,0 +1,237 @@ +""" +This module provides execution timeout monitoring for Jupyter notebook cells. +It can detect long-running executions and optionally send warnings or interrupt them. +""" + +import os +import signal +import threading +from typing import Optional + +import requests +from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult + +from .get_webapp_url import get_webapp_url +from .logging import LoggerManager + + +class ExecutionTimeoutMonitor: + """ + Monitors execution duration and can send warnings or interrupt stuck executions. + """ + + def __init__( + self, + warning_threshold_seconds: int = 240, + timeout_seconds: int = 300, + enable_auto_interrupt: bool = False, + ): + """ + Initialize the execution timeout monitor. + + Args: + warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min) + timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min) + enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False) + """ + self.logger = LoggerManager.get_logger("execution_timeout") + self.warning_threshold = warning_threshold_seconds + self.timeout_threshold = timeout_seconds + self.enable_auto_interrupt = enable_auto_interrupt + self.current_execution: Optional[dict] = None + self.warning_timer: Optional[threading.Timer] = None + self.timeout_timer: Optional[threading.Timer] = None + + def on_pre_execute(self, info: ExecutionInfo) -> None: + """ + Called before executing a cell. + Starts timers for warning and timeout. + """ + import time + + cell_preview = info.raw_cell[:100] if info.raw_cell else "" + + self.current_execution = { + "code": cell_preview, + "start": time.time(), + } + + # Start warning timer + if self.warning_threshold > 0: + self.warning_timer = threading.Timer( + self.warning_threshold, self._send_warning + ) + self.warning_timer.daemon = True + self.warning_timer.start() + + # Start timeout timer + if self.enable_auto_interrupt and self.timeout_threshold > 0: + self.timeout_timer = threading.Timer( + self.timeout_threshold, self._interrupt_execution + ) + self.timeout_timer.daemon = True + self.timeout_timer.start() + + self.logger.debug( + "Timeout monitoring started: warning=%ds, timeout=%ds, auto_interrupt=%s", + self.warning_threshold, + self.timeout_threshold, + self.enable_auto_interrupt, + ) + + def on_post_execute(self, result: ExecutionResult) -> None: + """ + Called after executing a cell. + Cancels any pending timers. + """ + self._cancel_timers() + self.current_execution = None + + def _cancel_timers(self) -> None: + """Cancel all active timers.""" + if self.warning_timer: + self.warning_timer.cancel() + self.warning_timer = None + if self.timeout_timer: + self.timeout_timer.cancel() + self.timeout_timer = None + + def _send_warning(self) -> None: + """Send warning when execution is running longer than threshold.""" + if not self.current_execution: + return + + import time + + duration = time.time() - self.current_execution["start"] + code_preview = self.current_execution["code"][:50] + + self.logger.warning( + "LONG_EXECUTION | duration=%.1fs | preview=%s", + duration, + code_preview.replace("\n", "\\n"), + ) + + # Try to report to webapp + self._report_to_webapp(duration, code_preview, warning=True) + + def _interrupt_execution(self) -> None: + """Interrupt execution after timeout threshold is exceeded.""" + if not self.current_execution: + return + + import time + + duration = time.time() - self.current_execution["start"] + + self.logger.error( + "TIMEOUT_INTERRUPT | duration=%.1fs | Sending SIGINT to interrupt execution", + duration, + ) + + # Report to webapp before interrupting + self._report_to_webapp( + duration, self.current_execution["code"][:50], warning=False + ) + + # Send SIGINT to interrupt the execution (simulates Ctrl+C) + try: + os.kill(os.getpid(), signal.SIGINT) + except Exception as e: # pylint: disable=broad-exception-caught + self.logger.error("Failed to send SIGINT: %s", e) + + def _report_to_webapp( + self, duration: float, code_preview: str, warning: bool + ) -> None: + """ + Report execution warning/timeout to webapp. + + Args: + duration: Execution duration in seconds + code_preview: Preview of the code being executed + warning: Whether this is a warning (True) or timeout (False) + """ + try: + webapp_url = get_webapp_url() + project_id = os.getenv("DEEPNOTE_PROJECT_ID") + + if not webapp_url or not project_id: + self.logger.debug( + "Webapp URL or project ID not available, skipping report" + ) + return + + endpoint = ( + "warning" if warning else "timeout" + ) + url = f"{webapp_url}/userpod-api/{project_id}/execution/{endpoint}" + + payload = { + "duration": duration, + "code_preview": code_preview, + "threshold": ( + self.warning_threshold if warning else self.timeout_threshold + ), + } + + response = requests.post(url, json=payload, timeout=2) + response.raise_for_status() + + self.logger.debug("Successfully reported %s to webapp", endpoint) + + except Exception as e: # pylint: disable=broad-exception-caught + self.logger.error("Failed to report to webapp: %s", e) + + +# Global instance +_timeout_monitor: Optional[ExecutionTimeoutMonitor] = None + + +def setup_execution_timeout_monitor( + warning_threshold_seconds: int = 240, + timeout_seconds: int = 300, + enable_auto_interrupt: bool = False, +) -> None: + """ + Set up execution timeout monitoring. + + This is optional and should be called during runtime initialization if needed. + + Args: + warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min) + timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min) + enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False) + """ + global _timeout_monitor # pylint: disable=global-statement + + try: + from IPython import get_ipython + + ip = get_ipython() + if ip is None: + LoggerManager.get_logger("execution_timeout").warning( + "IPython instance not available, skipping timeout monitor setup" + ) + return + + _timeout_monitor = ExecutionTimeoutMonitor( + warning_threshold_seconds=warning_threshold_seconds, + timeout_seconds=timeout_seconds, + enable_auto_interrupt=enable_auto_interrupt, + ) + + # Register event handlers + ip.events.register("pre_execute", _timeout_monitor.on_pre_execute) + ip.events.register("post_execute", _timeout_monitor.on_post_execute) + + LoggerManager.get_logger("execution_timeout").info( + "Execution timeout monitor initialized: warning=%ds, timeout=%ds, auto_interrupt=%s", + warning_threshold_seconds, + timeout_seconds, + enable_auto_interrupt, + ) + + except Exception as e: # pylint: disable=broad-exception-caught + LoggerManager.get_logger("execution_timeout").error( + "Failed to set up timeout monitor: %s", e + ) From 1f819e35f9e2845a4399983f8fe586319bf8c6e0 Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 17:59:40 +0100 Subject: [PATCH 3/7] Add execution metadata publishing to webapp - Add publish_execution_metadata() function - Define DEEPNOTE_EXECUTION_METADATA_MIME_TYPE constant - Publish structured execution data via display_pub - Include duration, success status, and error type in metadata --- deepnote_toolkit/ipython_utils.py | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/deepnote_toolkit/ipython_utils.py b/deepnote_toolkit/ipython_utils.py index a789985..b51d749 100644 --- a/deepnote_toolkit/ipython_utils.py +++ b/deepnote_toolkit/ipython_utils.py @@ -1,7 +1,12 @@ +from typing import Optional + from IPython import get_ipython # also defined in https://github.com/deepnote/deepnote/blob/a9f36659f50c84bd85aeba8ee2d3d4458f2f4998/libs/shared/src/constants.ts#L47 DEEPNOTE_SQL_METADATA_MIME_TYPE = "application/vnd.deepnote.sql-output-metadata+json" +DEEPNOTE_EXECUTION_METADATA_MIME_TYPE = ( + "application/vnd.deepnote.execution-metadata+json" +) def output_display_data(mime_bundle): @@ -24,3 +29,37 @@ def output_sql_metadata(metadata: dict): None """ output_display_data({DEEPNOTE_SQL_METADATA_MIME_TYPE: metadata}) + + +def publish_execution_metadata( + execution_count: int, + duration: float, + success: bool, + error_type: Optional[str] = None, +) -> None: + """ + Publish execution metadata to the webapp via display_pub. + + This function publishes structured metadata about cell execution that can be + consumed by the webapp for monitoring, debugging, and analytics purposes. + + Args: + execution_count (int): The execution count of the cell + duration (float): Execution duration in seconds + success (bool): Whether the execution completed successfully + error_type (str, optional): The type of error if execution failed + + Returns: + None + """ + metadata = { + "execution_count": execution_count, + "duration_seconds": duration, + "success": success, + "timestamp": duration, # Using duration as timestamp for now + } + + if error_type: + metadata["error_type"] = error_type + + output_display_data({DEEPNOTE_EXECUTION_METADATA_MIME_TYPE: metadata}) From 3fa8852c847647163b44791f9dd8a520dabeb843 Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 17:59:47 +0100 Subject: [PATCH 4/7] Integrate execution tracking and timeout monitoring into runtime - Import and setup execution tracking during runtime init - Add optional execution timeout monitor setup - Configure timeout monitor via environment variables - Add error handling for both features - Maintain backward compatibility (timeout monitor disabled by default) --- deepnote_toolkit/runtime_initialization.py | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/deepnote_toolkit/runtime_initialization.py b/deepnote_toolkit/runtime_initialization.py index d8a7e3c..bd1b4b9 100644 --- a/deepnote_toolkit/runtime_initialization.py +++ b/deepnote_toolkit/runtime_initialization.py @@ -1,6 +1,7 @@ """ This module contains functions to set up the Deepnote runtime environment. """ import builtins +import os import IPython.core.page import psycopg2.extensions @@ -8,6 +9,8 @@ from .dataframe_utils import add_formatters from .execute_post_start_hooks import execute_post_start_hooks +from .execution_timeout import setup_execution_timeout_monitor +from .execution_tracking import setup_execution_tracking from .logging import LoggerManager from .output_middleware import add_output_middleware from .set_integrations_env import set_integration_env @@ -44,6 +47,35 @@ def init_deepnote_runtime(): except Exception as e: # pylint: disable=broad-exception-caught logger.error("Failed to add output middleware with a error: %s", e) + # Set up execution tracking for monitoring cell execution + try: + logger.debug("Setting up execution tracking.") + setup_execution_tracking() + except Exception as e: # pylint: disable=broad-exception-caught + logger.error("Failed to set up execution tracking with a error: %s", e) + + # Set up execution timeout monitor (optional, controlled by env vars) + enable_timeout_monitor = os.getenv("DEEPNOTE_ENABLE_EXECUTION_TIMEOUT", "false").lower() == "true" + if enable_timeout_monitor: + try: + warning_threshold = int(os.getenv("DEEPNOTE_EXECUTION_WARNING_THRESHOLD", "240")) + timeout_threshold = int(os.getenv("DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD", "300")) + auto_interrupt = os.getenv("DEEPNOTE_EXECUTION_AUTO_INTERRUPT", "false").lower() == "true" + + logger.debug( + "Setting up execution timeout monitor: warning=%ds, timeout=%ds, auto_interrupt=%s", + warning_threshold, + timeout_threshold, + auto_interrupt, + ) + setup_execution_timeout_monitor( + warning_threshold_seconds=warning_threshold, + timeout_seconds=timeout_threshold, + enable_auto_interrupt=auto_interrupt, + ) + except Exception as e: # pylint: disable=broad-exception-caught + logger.error("Failed to set up execution timeout monitor with a error: %s", e) + # Set up psycopg2 to make long-running queries interruptible by SIGINT (interrupt kernel) try: logger.debug("Setting psycopg2.") From b525a2017fb5aebfdea70c2cf67d5b23fc1f47e0 Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 18:00:01 +0100 Subject: [PATCH 5/7] Add comprehensive documentation for execution tracking features - Document execution tracking and timeout monitoring - Include configuration examples and environment variables - Provide debugging guide for stuck executions - Explain log formats and locations - Add testing instructions and examples - List all modified/created files - Include future enhancement ideas --- docs/EXECUTION_TRACKING.md | 237 +++++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 docs/EXECUTION_TRACKING.md diff --git a/docs/EXECUTION_TRACKING.md b/docs/EXECUTION_TRACKING.md new file mode 100644 index 0000000..e88a051 --- /dev/null +++ b/docs/EXECUTION_TRACKING.md @@ -0,0 +1,237 @@ +# Execution Tracking and Timeout Monitoring + +This document describes the execution tracking and timeout monitoring features added to improve observability and debugging of stuck Jupyter notebook executions. + +## Overview + +The execution tracking system provides: + +1. **Execution Event Tracking**: Logs every cell execution with start/end times, duration, and success status +2. **Execution Metadata Publishing**: Publishes structured metadata to the webapp for monitoring +3. **ZMQ Message Flow Logging**: Enhanced debugging of kernel-server communication +4. **Execution Timeout Monitoring**: Optional monitoring with warnings and auto-interrupt for stuck executions + +## Features + +### 1. Execution Event Tracking + +**Location**: `deepnote_toolkit/execution_tracking.py` + +Automatically tracks all notebook cell executions and logs: +- Execution start with cell preview +- Execution end with duration and success status +- Error types if execution fails + +**Log Format**: +``` +EXEC_START | count=1 | cell_id=12345 | preview=import pandas as pd\ndf = pd.read_csv(... +EXEC_END | count=1 | duration=2.45s | success=True +``` + +This is **enabled by default** and requires no configuration. + +### 2. Execution Metadata Publishing + +**Location**: `deepnote_toolkit/ipython_utils.py` + +Publishes execution metadata to the webapp via the `application/vnd.deepnote.execution-metadata+json` MIME type. + +**Metadata Structure**: +```python +{ + "execution_count": 1, + "duration_seconds": 2.45, + "success": True, + "error_type": "KeyError" # Only present if execution failed +} +``` + +This allows the webapp to: +- Display execution duration in the UI +- Track execution history +- Alert on failed executions +- Build execution analytics + +### 3. ZMQ Session Debug Logging + +**Location**: `deepnote_core/resources/jupyter/jupyter_server_config.py:817` + +Enables detailed ZMQ message logging by setting: +```python +c.Session.debug = True +``` + +This logs all ZMQ messages exchanged between the Jupyter server and kernel, which is critical for debugging: +- Message delays +- Message loss +- Protocol issues +- Kernel communication problems + +**Enhanced Logging Configuration** (line 418-426): +```python +c.ServerApp.logging_config = { + "loggers": { + "tornado.access": {"level": "DEBUG"}, + "jupyter_server.serverapp": {"level": "DEBUG"}, + "jupyter_client.session": {"level": "DEBUG"}, + "execution_tracker": {"level": "INFO"}, + "execution_tracking": {"level": "INFO"}, + } +} +``` + +### 4. Execution Timeout Monitoring (Optional) + +**Location**: `deepnote_toolkit/execution_timeout.py` + +Optional feature that monitors execution duration and can: +- Send warnings when executions exceed a threshold +- Report to the webapp +- Automatically interrupt stuck executions (if enabled) + +**Configuration via Environment Variables**: + +```bash +# Enable timeout monitoring (default: false) +export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true + +# Warning threshold in seconds (default: 240 = 4 minutes) +export DEEPNOTE_EXECUTION_WARNING_THRESHOLD=240 + +# Timeout threshold in seconds (default: 300 = 5 minutes) +export DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD=300 + +# Enable automatic interrupt of stuck executions (default: false) +# WARNING: Use with caution! This will send SIGINT to interrupt executions +export DEEPNOTE_EXECUTION_AUTO_INTERRUPT=false +``` + +**Example Usage**: +```bash +# Enable with default thresholds +export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true + +# Enable with custom thresholds and auto-interrupt +export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true +export DEEPNOTE_EXECUTION_WARNING_THRESHOLD=120 # Warn after 2 minutes +export DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD=180 # Timeout after 3 minutes +export DEEPNOTE_EXECUTION_AUTO_INTERRUPT=true # Auto-interrupt +``` + +## Log Locations + +All execution tracking logs are written to the standard Deepnote log location: +- **File**: `$DEEPNOTE_LOG_DIR/helpers.log` (or XDG default) +- **Scraped by**: Loki for centralized logging + +## Debugging Stuck Executions + +When investigating stuck executions, follow this process: + +1. **Check Execution Logs**: Look for `EXEC_START` without matching `EXEC_END` + ```bash + grep "EXEC_START\|EXEC_END" $DEEPNOTE_LOG_DIR/helpers.log + ``` + +2. **Check ZMQ Message Flow**: Look for message delays or missing messages + ```bash + grep "Session.send\|Session.recv" /var/log/jupyter-server.log + ``` + +3. **Check for Long Executions**: Look for `LONG_EXECUTION` warnings + ```bash + grep "LONG_EXECUTION" $DEEPNOTE_LOG_DIR/helpers.log + ``` + +4. **Enable Timeout Monitoring**: If not already enabled, set environment variables and restart kernel + +## Implementation Details + +### Execution Tracking Flow + +1. User executes a cell +2. IPython fires `pre_execute` event → `ExecutionTracker.on_pre_execute()` + - Logs `EXEC_START` + - Records start time and cell metadata +3. Cell executes... +4. IPython fires `post_execute` event → `ExecutionTracker.on_post_execute()` + - Calculates duration + - Logs `EXEC_END` with duration and success status + - Publishes metadata to webapp + +### Timeout Monitoring Flow (if enabled) + +1. `pre_execute` event → `ExecutionTimeoutMonitor.on_pre_execute()` + - Starts warning timer (default: 240s) + - Starts timeout timer (default: 300s) if auto-interrupt enabled +2. If warning timer fires → `_send_warning()` + - Logs `LONG_EXECUTION` warning + - Reports to webapp +3. If timeout timer fires → `_interrupt_execution()` + - Logs `TIMEOUT_INTERRUPT` error + - Reports to webapp + - Sends SIGINT to kernel (if auto-interrupt enabled) +4. `post_execute` event → cancels all timers + +## Files Modified/Created + +### New Files +- `deepnote_toolkit/execution_tracking.py` - Execution tracking implementation +- `deepnote_toolkit/execution_timeout.py` - Timeout monitoring implementation +- `docs/EXECUTION_TRACKING.md` - This documentation + +### Modified Files +- `deepnote_toolkit/runtime_initialization.py` - Added tracking and timeout setup +- `deepnote_toolkit/ipython_utils.py` - Added execution metadata publishing +- `deepnote_core/resources/jupyter/jupyter_server_config.py` - Enhanced logging config + +## Testing + +To test the execution tracking: + +1. Start a Jupyter kernel with the toolkit +2. Execute a simple cell: + ```python + print("Hello, world!") + ``` +3. Check logs for execution tracking: + ```bash + tail -f $DEEPNOTE_LOG_DIR/helpers.log | grep "EXEC_" + ``` +4. You should see: + ``` + EXEC_START | count=1 | cell_id=... | preview=print("Hello, world!") + EXEC_END | count=1 | duration=0.01s | success=True + ``` + +To test timeout monitoring: + +1. Enable timeout monitoring: + ```bash + export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true + export DEEPNOTE_EXECUTION_WARNING_THRESHOLD=5 # 5 seconds for testing + ``` +2. Restart kernel +3. Execute a long-running cell: + ```python + import time + time.sleep(10) + ``` +4. After 5 seconds, you should see a `LONG_EXECUTION` warning in the logs + +## Future Enhancements + +Potential improvements to consider: + +1. **Kernel Heartbeat Monitoring**: Monitor ZMQ heartbeat channel to detect dead kernels +2. **Message Queue Metrics**: Track IOPub message queue depth to detect backpressure +3. **Execution History**: Store execution history in database for analytics +4. **WebApp Integration**: Display execution metrics in real-time in the UI +5. **Adaptive Timeouts**: Adjust timeout thresholds based on historical execution patterns +6. **Execution Profiling**: Integrate with Python profilers for detailed performance analysis + +## References + +- Jupyter Client Protocol: https://jupyter-client.readthedocs.io/en/stable/messaging.html +- IPython Events: https://ipython.readthedocs.io/en/stable/config/callbacks.html +- ZMQ Guide: https://zguide.zeromq.org/ From 4020118cfa59fbad863215a0cabc32518e273665 Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 18:05:21 +0100 Subject: [PATCH 6/7] Fix LoggerManager and webapp URL imports - Fix LoggerManager usage: use LoggerManager().get_logger() instead of LoggerManager.get_logger() - Fix webapp URL import: use get_absolute_userpod_api_url() instead of non-existent get_webapp_url() - All imports and functionality tests now pass --- deepnote_toolkit/execution_timeout.py | 25 +++++++------------------ deepnote_toolkit/execution_tracking.py | 8 ++++---- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/deepnote_toolkit/execution_timeout.py b/deepnote_toolkit/execution_timeout.py index 19945c1..66d61cd 100644 --- a/deepnote_toolkit/execution_timeout.py +++ b/deepnote_toolkit/execution_timeout.py @@ -11,7 +11,7 @@ import requests from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult -from .get_webapp_url import get_webapp_url +from .get_webapp_url import get_absolute_userpod_api_url from .logging import LoggerManager @@ -34,7 +34,7 @@ def __init__( timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min) enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False) """ - self.logger = LoggerManager.get_logger("execution_timeout") + self.logger = LoggerManager().get_logger() self.warning_threshold = warning_threshold_seconds self.timeout_threshold = timeout_seconds self.enable_auto_interrupt = enable_auto_interrupt @@ -152,19 +152,8 @@ def _report_to_webapp( warning: Whether this is a warning (True) or timeout (False) """ try: - webapp_url = get_webapp_url() - project_id = os.getenv("DEEPNOTE_PROJECT_ID") - - if not webapp_url or not project_id: - self.logger.debug( - "Webapp URL or project ID not available, skipping report" - ) - return - - endpoint = ( - "warning" if warning else "timeout" - ) - url = f"{webapp_url}/userpod-api/{project_id}/execution/{endpoint}" + endpoint = "warning" if warning else "timeout" + url = get_absolute_userpod_api_url(f"execution/{endpoint}") payload = { "duration": duration, @@ -209,7 +198,7 @@ def setup_execution_timeout_monitor( ip = get_ipython() if ip is None: - LoggerManager.get_logger("execution_timeout").warning( + LoggerManager().get_logger().warning( "IPython instance not available, skipping timeout monitor setup" ) return @@ -224,7 +213,7 @@ def setup_execution_timeout_monitor( ip.events.register("pre_execute", _timeout_monitor.on_pre_execute) ip.events.register("post_execute", _timeout_monitor.on_post_execute) - LoggerManager.get_logger("execution_timeout").info( + LoggerManager().get_logger().info( "Execution timeout monitor initialized: warning=%ds, timeout=%ds, auto_interrupt=%s", warning_threshold_seconds, timeout_seconds, @@ -232,6 +221,6 @@ def setup_execution_timeout_monitor( ) except Exception as e: # pylint: disable=broad-exception-caught - LoggerManager.get_logger("execution_timeout").error( + LoggerManager().get_logger().error( "Failed to set up timeout monitor: %s", e ) diff --git a/deepnote_toolkit/execution_tracking.py b/deepnote_toolkit/execution_tracking.py index 3afe4e5..3ac5b39 100644 --- a/deepnote_toolkit/execution_tracking.py +++ b/deepnote_toolkit/execution_tracking.py @@ -17,7 +17,7 @@ class ExecutionTracker: """Tracks execution state of notebook cells.""" def __init__(self): - self.logger = LoggerManager.get_logger("execution_tracker") + self.logger = LoggerManager().get_logger() self.current_execution: Optional[Dict[str, Any]] = None self.execution_count = 0 @@ -109,7 +109,7 @@ def setup_execution_tracking() -> None: ip = get_ipython() if ip is None: - LoggerManager.get_logger("execution_tracking").warning( + LoggerManager().get_logger().warning( "IPython instance not available, skipping execution tracking setup" ) return @@ -122,11 +122,11 @@ def setup_execution_tracking() -> None: ip.events.register("pre_run_cell", _execution_tracker.on_pre_run_cell) ip.events.register("post_run_cell", _execution_tracker.on_post_run_cell) - LoggerManager.get_logger("execution_tracking").info( + LoggerManager().get_logger().info( "Execution tracking initialized successfully" ) except Exception as e: # pylint: disable=broad-exception-caught - LoggerManager.get_logger("execution_tracking").error( + LoggerManager().get_logger().error( "Failed to set up execution tracking: %s", e ) From 8063f60cec2d327cff99b1a14607920e1a657da8 Mon Sep 17 00:00:00 2001 From: Hannes Probst Date: Fri, 14 Nov 2025 18:18:36 +0100 Subject: [PATCH 7/7] Fix blocking issues from PR review - Fix incorrect timestamp: use time.time() instead of duration in metadata - Move time imports to module level in execution_timeout.py - Add threading lock to fix race condition in timeout monitoring - Protect current_execution access with lock - Copy execution data before processing outside lock - All fixes validated and tested --- deepnote_toolkit/execution_timeout.py | 79 ++++++++++++++------------- deepnote_toolkit/ipython_utils.py | 3 +- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/deepnote_toolkit/execution_timeout.py b/deepnote_toolkit/execution_timeout.py index 66d61cd..b764734 100644 --- a/deepnote_toolkit/execution_timeout.py +++ b/deepnote_toolkit/execution_timeout.py @@ -6,6 +6,7 @@ import os import signal import threading +import time from typing import Optional import requests @@ -41,36 +42,36 @@ def __init__( self.current_execution: Optional[dict] = None self.warning_timer: Optional[threading.Timer] = None self.timeout_timer: Optional[threading.Timer] = None + self._execution_lock = threading.Lock() def on_pre_execute(self, info: ExecutionInfo) -> None: """ Called before executing a cell. Starts timers for warning and timeout. """ - import time - cell_preview = info.raw_cell[:100] if info.raw_cell else "" - self.current_execution = { - "code": cell_preview, - "start": time.time(), - } - - # Start warning timer - if self.warning_threshold > 0: - self.warning_timer = threading.Timer( - self.warning_threshold, self._send_warning - ) - self.warning_timer.daemon = True - self.warning_timer.start() + with self._execution_lock: + self.current_execution = { + "code": cell_preview, + "start": time.time(), + } - # Start timeout timer - if self.enable_auto_interrupt and self.timeout_threshold > 0: - self.timeout_timer = threading.Timer( - self.timeout_threshold, self._interrupt_execution - ) - self.timeout_timer.daemon = True - self.timeout_timer.start() + # Start warning timer + if self.warning_threshold > 0: + self.warning_timer = threading.Timer( + self.warning_threshold, self._send_warning + ) + self.warning_timer.daemon = True + self.warning_timer.start() + + # Start timeout timer + if self.enable_auto_interrupt and self.timeout_threshold > 0: + self.timeout_timer = threading.Timer( + self.timeout_threshold, self._interrupt_execution + ) + self.timeout_timer.daemon = True + self.timeout_timer.start() self.logger.debug( "Timeout monitoring started: warning=%ds, timeout=%ds, auto_interrupt=%s", @@ -84,8 +85,9 @@ def on_post_execute(self, result: ExecutionResult) -> None: Called after executing a cell. Cancels any pending timers. """ - self._cancel_timers() - self.current_execution = None + with self._execution_lock: + self._cancel_timers() + self.current_execution = None def _cancel_timers(self) -> None: """Cancel all active timers.""" @@ -98,13 +100,15 @@ def _cancel_timers(self) -> None: def _send_warning(self) -> None: """Send warning when execution is running longer than threshold.""" - if not self.current_execution: - return - - import time + # Capture execution data while holding lock + with self._execution_lock: + if not self.current_execution: + return + execution_data = self.current_execution.copy() - duration = time.time() - self.current_execution["start"] - code_preview = self.current_execution["code"][:50] + # Process outside lock to avoid blocking + duration = time.time() - execution_data["start"] + code_preview = execution_data["code"][:50] self.logger.warning( "LONG_EXECUTION | duration=%.1fs | preview=%s", @@ -117,12 +121,15 @@ def _send_warning(self) -> None: def _interrupt_execution(self) -> None: """Interrupt execution after timeout threshold is exceeded.""" - if not self.current_execution: - return + # Capture execution data while holding lock + with self._execution_lock: + if not self.current_execution: + return + execution_data = self.current_execution.copy() - import time - - duration = time.time() - self.current_execution["start"] + # Process outside lock to avoid blocking + duration = time.time() - execution_data["start"] + code_preview = execution_data["code"][:50] self.logger.error( "TIMEOUT_INTERRUPT | duration=%.1fs | Sending SIGINT to interrupt execution", @@ -130,9 +137,7 @@ def _interrupt_execution(self) -> None: ) # Report to webapp before interrupting - self._report_to_webapp( - duration, self.current_execution["code"][:50], warning=False - ) + self._report_to_webapp(duration, code_preview, warning=False) # Send SIGINT to interrupt the execution (simulates Ctrl+C) try: diff --git a/deepnote_toolkit/ipython_utils.py b/deepnote_toolkit/ipython_utils.py index b51d749..67813fc 100644 --- a/deepnote_toolkit/ipython_utils.py +++ b/deepnote_toolkit/ipython_utils.py @@ -1,3 +1,4 @@ +import time from typing import Optional from IPython import get_ipython @@ -56,7 +57,7 @@ def publish_execution_metadata( "execution_count": execution_count, "duration_seconds": duration, "success": success, - "timestamp": duration, # Using duration as timestamp for now + "timestamp": time.time(), } if error_type: