From 50af98e2b14e723b2b18a68156bd78126927e74b Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 17:59:24 +0100
Subject: [PATCH 1/7] Add execution tracking for Jupyter cell executions

- Track execution start/end with timestamps
- Log execution duration and success status
- Publish execution metadata to webapp
- Register IPython event handlers for monitoring
---
 deepnote_toolkit/execution_tracking.py | 132 +++++++++++++++++++++++++
 1 file changed, 132 insertions(+)
 create mode 100644 deepnote_toolkit/execution_tracking.py
diff --git a/deepnote_toolkit/execution_tracking.py b/deepnote_toolkit/execution_tracking.py
new file mode 100644
index 0000000..3afe4e5
--- /dev/null
+++ b/deepnote_toolkit/execution_tracking.py
@@ -0,0 +1,132 @@
+"""
+This module provides execution tracking for Jupyter notebook cells.
+It monitors execution start/end events, duration, and publishes metadata to help
+debug stuck executions and improve observability.
+"""
+
+import time
+from typing import Any, Dict, Optional
+
+from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult
+
+from .ipython_utils import publish_execution_metadata
+from .logging import LoggerManager
+
+
+class ExecutionTracker:
+    """Tracks execution state of notebook cells."""
+
+    def __init__(self):
+        self.logger = LoggerManager.get_logger("execution_tracker")
+        self.current_execution: Optional[Dict[str, Any]] = None
+        self.execution_count = 0
+
+    def on_pre_execute(self, info: ExecutionInfo) -> None:
+        """
+        Called before executing a cell.
+        Logs execution start and tracks execution metadata.
+        """
+        self.execution_count += 1
+        cell_preview = info.raw_cell[:100] if info.raw_cell else "<empty>"
+        cell_id = hash(info.raw_cell) if info.raw_cell else 0
+
+        self.current_execution = {
+            "cell_id": cell_id,
+            "code_preview": cell_preview,
+            "start_time": time.time(),
+            "execution_count": self.execution_count,
+        }
+
+        self.logger.info(
+            "EXEC_START | count=%d | cell_id=%s | preview=%s",
+            self.execution_count,
+            cell_id,
+            cell_preview[:50].replace("\n", "\\n"),
+        )
+
+    def on_post_execute(self, result: ExecutionResult) -> None:
+        """
+        Called after executing a cell.
+        Logs execution end, duration, and success status.
+        Publishes execution metadata for webapp consumption.
+        """
+        if not self.current_execution:
+            self.logger.warning("EXEC_END called without matching EXEC_START")
+            return
+
+        duration = time.time() - self.current_execution["start_time"]
+        success = result.error_in_exec is None
+        error_name = (
+            type(result.error_in_exec).__name__ if result.error_in_exec else None
+        )
+
+        self.logger.info(
+            "EXEC_END | count=%d | duration=%.2fs | success=%s%s",
+            self.current_execution["execution_count"],
+            duration,
+            success,
+            f" | error={error_name}" if error_name else "",
+        )
+
+        # Publish metadata to webapp
+        try:
+            publish_execution_metadata(
+                execution_count=self.current_execution["execution_count"],
+                duration=duration,
+                success=success,
+                error_type=error_name,
+            )
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            self.logger.error("Failed to publish execution metadata: %s", e)
+
+        self.current_execution = None
+
+    def on_pre_run_cell(self, info: ExecutionInfo) -> None:
+        """Called before running a cell (before pre_execute)."""
+        cell_preview = info.raw_cell[:30] if info.raw_cell else "<empty>"
+        self.logger.debug(
+            "PRE_RUN | preview=%s", cell_preview.replace("\n", "\\n")
+        )
+
+    def on_post_run_cell(self, result: ExecutionResult) -> None:
+        """Called after running a cell (after post_execute)."""
+        self.logger.debug("POST_RUN | exec_count=%s", result.execution_count)
+
+
+# Global instance
+_execution_tracker: Optional[ExecutionTracker] = None
+
+
+def setup_execution_tracking() -> None:
+    """
+    Set up IPython event handlers for execution tracking.
+    This should be called during runtime initialization.
+    """
+    global _execution_tracker  # pylint: disable=global-statement
+
+    try:
+        from IPython import get_ipython
+
+        ip = get_ipython()
+        if ip is None:
+            LoggerManager.get_logger("execution_tracking").warning(
+                "IPython instance not available, skipping execution tracking setup"
+            )
+            return
+
+        _execution_tracker = ExecutionTracker()
+
+        # Register event handlers
+        ip.events.register("pre_execute", _execution_tracker.on_pre_execute)
+        ip.events.register("post_execute", _execution_tracker.on_post_execute)
+        ip.events.register("pre_run_cell", _execution_tracker.on_pre_run_cell)
+        ip.events.register("post_run_cell", _execution_tracker.on_post_run_cell)
+
+        LoggerManager.get_logger("execution_tracking").info(
+            "Execution tracking initialized successfully"
+        )
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        LoggerManager.get_logger("execution_tracking").error(
+            "Failed to set up execution tracking: %s", e
+        )

From c1d4ccae9f6855ee56877bebf94291a384715b56 Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 17:59:34 +0100
Subject: [PATCH 2/7] Add optional execution timeout monitoring

- Monitor long-running executions with configurable thresholds
- Send warnings when executions exceed warning threshold
- Optional auto-interrupt for stuck executions via SIGINT
- Report warnings/timeouts to webapp
- Configurable via environment variables:
  - DEEPNOTE_ENABLE_EXECUTION_TIMEOUT
  - DEEPNOTE_EXECUTION_WARNING_THRESHOLD (default: 240s)
  - DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD (default: 300s)
  - DEEPNOTE_EXECUTION_AUTO_INTERRUPT (default: false)
---
 deepnote_toolkit/execution_timeout.py | 237 ++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100644 deepnote_toolkit/execution_timeout.py

diff --git a/deepnote_toolkit/execution_timeout.py b/deepnote_toolkit/execution_timeout.py
new file mode 100644
index 0000000..19945c1
--- /dev/null
+++ b/deepnote_toolkit/execution_timeout.py
@@ -0,0 +1,237 @@
+"""
+This module provides execution timeout monitoring for Jupyter notebook cells.
+It can detect long-running executions and optionally send warnings or interrupt them.
+"""
+
+import os
+import signal
+import threading
+from typing import Optional
+
+import requests
+from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult
+
+from .get_webapp_url import get_webapp_url
+from .logging import LoggerManager
+
+
+class ExecutionTimeoutMonitor:
+    """
+    Monitors execution duration and can send warnings or interrupt stuck executions.
+    """
+
+    def __init__(
+        self,
+        warning_threshold_seconds: int = 240,
+        timeout_seconds: int = 300,
+        enable_auto_interrupt: bool = False,
+    ):
+        """
+        Initialize the execution timeout monitor.
+
+        Args:
+            warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min)
+            timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min)
+            enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False)
+        """
+        self.logger = LoggerManager.get_logger("execution_timeout")
+        self.warning_threshold = warning_threshold_seconds
+        self.timeout_threshold = timeout_seconds
+        self.enable_auto_interrupt = enable_auto_interrupt
+        self.current_execution: Optional[dict] = None
+        self.warning_timer: Optional[threading.Timer] = None
+        self.timeout_timer: Optional[threading.Timer] = None
+
+    def on_pre_execute(self, info: ExecutionInfo) -> None:
+        """
+        Called before executing a cell.
+        Starts timers for warning and timeout.
+        """
+        import time
+
+        cell_preview = info.raw_cell[:100] if info.raw_cell else "<empty>"
+
+        self.current_execution = {
+            "code": cell_preview,
+            "start": time.time(),
+        }
+
+        # Start warning timer
+        if self.warning_threshold > 0:
+            self.warning_timer = threading.Timer(
+                self.warning_threshold, self._send_warning
+            )
+            self.warning_timer.daemon = True
+            self.warning_timer.start()
+
+        # Start timeout timer
+        if self.enable_auto_interrupt and self.timeout_threshold > 0:
+            self.timeout_timer = threading.Timer(
+                self.timeout_threshold, self._interrupt_execution
+            )
+            self.timeout_timer.daemon = True
+            self.timeout_timer.start()
+
+        self.logger.debug(
+            "Timeout monitoring started: warning=%ds, timeout=%ds, auto_interrupt=%s",
+            self.warning_threshold,
+            self.timeout_threshold,
+            self.enable_auto_interrupt,
+        )
+
+    def on_post_execute(self, result: ExecutionResult) -> None:
+        """
+        Called after executing a cell.
+        Cancels any pending timers.
+        """
+        self._cancel_timers()
+        self.current_execution = None
+
+    def _cancel_timers(self) -> None:
+        """Cancel all active timers."""
+        if self.warning_timer:
+            self.warning_timer.cancel()
+            self.warning_timer = None
+        if self.timeout_timer:
+            self.timeout_timer.cancel()
+            self.timeout_timer = None
+
+    def _send_warning(self) -> None:
+        """Send warning when execution is running longer than threshold."""
+        if not self.current_execution:
+            return
+
+        import time
+
+        duration = time.time() - self.current_execution["start"]
+        code_preview = self.current_execution["code"][:50]
+
+        self.logger.warning(
+            "LONG_EXECUTION | duration=%.1fs | preview=%s",
+            duration,
+            code_preview.replace("\n", "\\n"),
+        )
+
+        # Try to report to webapp
+        self._report_to_webapp(duration, code_preview, warning=True)
+
+    def _interrupt_execution(self) -> None:
+        """Interrupt execution after timeout threshold is exceeded."""
+        if not self.current_execution:
+            return
+
+        import time
+
+        duration = time.time() - self.current_execution["start"]
+
+        self.logger.error(
+            "TIMEOUT_INTERRUPT | duration=%.1fs | Sending SIGINT to interrupt execution",
+            duration,
+        )
+
+        # Report to webapp before interrupting
+        self._report_to_webapp(
+            duration, self.current_execution["code"][:50], warning=False
+        )
+
+        # Send SIGINT to interrupt the execution (simulates Ctrl+C)
+        try:
+            os.kill(os.getpid(), signal.SIGINT)
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            self.logger.error("Failed to send SIGINT: %s", e)
+
+    def _report_to_webapp(
+        self, duration: float, code_preview: str, warning: bool
+    ) -> None:
+        """
+        Report execution warning/timeout to webapp.
+
+        Args:
+            duration: Execution duration in seconds
+            code_preview: Preview of the code being executed
+            warning: Whether this is a warning (True) or timeout (False)
+        """
+        try:
+            webapp_url = get_webapp_url()
+            project_id = os.getenv("DEEPNOTE_PROJECT_ID")
+
+            if not webapp_url or not project_id:
+                self.logger.debug(
+                    "Webapp URL or project ID not available, skipping report"
+                )
+                return
+
+            endpoint = (
+                "warning" if warning else "timeout"
+            )
+            url = f"{webapp_url}/userpod-api/{project_id}/execution/{endpoint}"
+
+            payload = {
+                "duration": duration,
+                "code_preview": code_preview,
+                "threshold": (
+                    self.warning_threshold if warning else self.timeout_threshold
+                ),
+            }
+
+            response = requests.post(url, json=payload, timeout=2)
+            response.raise_for_status()
+
+            self.logger.debug("Successfully reported %s to webapp", endpoint)
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            self.logger.error("Failed to report to webapp: %s", e)
+
+
+# Global instance
+_timeout_monitor: Optional[ExecutionTimeoutMonitor] = None
+
+
+def setup_execution_timeout_monitor(
+    warning_threshold_seconds: int = 240,
+    timeout_seconds: int = 300,
+    enable_auto_interrupt: bool = False,
+) -> None:
+    """
+    Set up execution timeout monitoring.
+
+    This is optional and should be called during runtime initialization if needed.
+
+    Args:
+        warning_threshold_seconds: Seconds after which to send a warning (default: 240s = 4min)
+        timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min)
+        enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False)
+    """
+    global _timeout_monitor  # pylint: disable=global-statement
+
+    try:
+        from IPython import get_ipython
+
+        ip = get_ipython()
+        if ip is None:
+            LoggerManager.get_logger("execution_timeout").warning(
+                "IPython instance not available, skipping timeout monitor setup"
+            )
+            return
+
+        _timeout_monitor = ExecutionTimeoutMonitor(
+            warning_threshold_seconds=warning_threshold_seconds,
+            timeout_seconds=timeout_seconds,
+            enable_auto_interrupt=enable_auto_interrupt,
+        )
+
+        # Register event handlers
+        ip.events.register("pre_execute", _timeout_monitor.on_pre_execute)
+        ip.events.register("post_execute", _timeout_monitor.on_post_execute)
+
+        LoggerManager.get_logger("execution_timeout").info(
+            "Execution timeout monitor initialized: warning=%ds, timeout=%ds, auto_interrupt=%s",
+            warning_threshold_seconds,
+            timeout_seconds,
+            enable_auto_interrupt,
+        )
+
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        LoggerManager.get_logger("execution_timeout").error(
+            "Failed to set up timeout monitor: %s", e
+        )

From 1f819e35f9e2845a4399983f8fe586319bf8c6e0 Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 17:59:40 +0100
Subject: [PATCH 3/7] Add execution metadata publishing to webapp

- Add publish_execution_metadata() function
- Define DEEPNOTE_EXECUTION_METADATA_MIME_TYPE constant
- Publish structured execution data via display_pub
- Include duration, success status, and error type in metadata
---
 deepnote_toolkit/ipython_utils.py | 39 +++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/deepnote_toolkit/ipython_utils.py b/deepnote_toolkit/ipython_utils.py
index a789985..b51d749 100644
--- a/deepnote_toolkit/ipython_utils.py
+++ b/deepnote_toolkit/ipython_utils.py
@@ -1,7 +1,12 @@
+from typing import Optional
+
 from IPython import get_ipython
 
 # also defined in https://github.com/deepnote/deepnote/blob/a9f36659f50c84bd85aeba8ee2d3d4458f2f4998/libs/shared/src/constants.ts#L47
 DEEPNOTE_SQL_METADATA_MIME_TYPE = "application/vnd.deepnote.sql-output-metadata+json"
+DEEPNOTE_EXECUTION_METADATA_MIME_TYPE = (
+    "application/vnd.deepnote.execution-metadata+json"
+)
 
 
 def output_display_data(mime_bundle):
@@ -24,3 +29,37 @@ def output_sql_metadata(metadata: dict):
         None
     """
     output_display_data({DEEPNOTE_SQL_METADATA_MIME_TYPE: metadata})
+
+
+def publish_execution_metadata(
+    execution_count: int,
+    duration: float,
+    success: bool,
+    error_type: Optional[str] = None,
+) -> None:
+    """
+    Publish execution metadata to the webapp via display_pub.
+
+    This function publishes structured metadata about cell execution that can be
+    consumed by the webapp for monitoring, debugging, and analytics purposes.
+
+    Args:
+        execution_count (int): The execution count of the cell
+        duration (float): Execution duration in seconds
+        success (bool): Whether the execution completed successfully
+        error_type (str, optional): The type of error if execution failed
+
+    Returns:
+        None
+    """
+    metadata = {
+        "execution_count": execution_count,
+        "duration_seconds": duration,
+        "success": success,
+        "timestamp": duration,  # Using duration as timestamp for now
+    }
+
+    if error_type:
+        metadata["error_type"] = error_type
+
+    output_display_data({DEEPNOTE_EXECUTION_METADATA_MIME_TYPE: metadata})

From 3fa8852c847647163b44791f9dd8a520dabeb843 Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 17:59:47 +0100
Subject: [PATCH 4/7] Integrate execution tracking and timeout monitoring into
 runtime

- Import and setup execution tracking during runtime init
- Add optional execution timeout monitor setup
- Configure timeout monitor via environment variables
- Add error handling for both features
- Maintain backward compatibility (timeout monitor disabled by default)
---
 deepnote_toolkit/runtime_initialization.py | 32 ++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/deepnote_toolkit/runtime_initialization.py b/deepnote_toolkit/runtime_initialization.py
index d8a7e3c..bd1b4b9 100644
--- a/deepnote_toolkit/runtime_initialization.py
+++ b/deepnote_toolkit/runtime_initialization.py
@@ -1,6 +1,7 @@
 """ This module contains functions to set up the Deepnote runtime environment. """
 
 import builtins
+import os
 
 import IPython.core.page
 import psycopg2.extensions
@@ -8,6 +9,8 @@
 
 from .dataframe_utils import add_formatters
 from .execute_post_start_hooks import execute_post_start_hooks
+from .execution_timeout import setup_execution_timeout_monitor
+from .execution_tracking import setup_execution_tracking
 from .logging import LoggerManager
 from .output_middleware import add_output_middleware
 from .set_integrations_env import set_integration_env
@@ -44,6 +47,35 @@ def init_deepnote_runtime():
     except Exception as e:  # pylint: disable=broad-exception-caught
         logger.error("Failed to add output middleware with a error: %s", e)
 
+    # Set up execution tracking for monitoring cell execution
+    try:
+        logger.debug("Setting up execution tracking.")
+        setup_execution_tracking()
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        logger.error("Failed to set up execution tracking with a error: %s", e)
+
+    # Set up execution timeout monitor (optional, controlled by env vars)
+    enable_timeout_monitor = os.getenv("DEEPNOTE_ENABLE_EXECUTION_TIMEOUT", "false").lower() == "true"
+    if enable_timeout_monitor:
+        try:
+            warning_threshold = int(os.getenv("DEEPNOTE_EXECUTION_WARNING_THRESHOLD", "240"))
+            timeout_threshold = int(os.getenv("DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD", "300"))
+            auto_interrupt = os.getenv("DEEPNOTE_EXECUTION_AUTO_INTERRUPT", "false").lower() == "true"
+
+            logger.debug(
+                "Setting up execution timeout monitor: warning=%ds, timeout=%ds, auto_interrupt=%s",
+                warning_threshold,
+                timeout_threshold,
+                auto_interrupt,
+            )
+            setup_execution_timeout_monitor(
+                warning_threshold_seconds=warning_threshold,
+                timeout_seconds=timeout_threshold,
+                enable_auto_interrupt=auto_interrupt,
+            )
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.error("Failed to set up execution timeout monitor with a error: %s", e)
+
     # Set up psycopg2 to make long-running queries interruptible by SIGINT (interrupt kernel)
     try:
         logger.debug("Setting psycopg2.")

From b525a2017fb5aebfdea70c2cf67d5b23fc1f47e0 Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 18:00:01 +0100
Subject: [PATCH 5/7] Add comprehensive documentation for execution tracking
 features

- Document execution tracking and timeout monitoring
- Include configuration examples and environment variables
- Provide debugging guide for stuck executions
- Explain log formats and locations
- Add testing instructions and examples
- List all modified/created files
- Include future enhancement ideas
---
 docs/EXECUTION_TRACKING.md | 237 +++++++++++++++++++++++++++++++++++++
 1 file changed, 237 insertions(+)
 create mode 100644 docs/EXECUTION_TRACKING.md

diff --git a/docs/EXECUTION_TRACKING.md b/docs/EXECUTION_TRACKING.md
new file mode 100644
index 0000000..e88a051
--- /dev/null
+++ b/docs/EXECUTION_TRACKING.md
@@ -0,0 +1,237 @@
+# Execution Tracking and Timeout Monitoring
+
+This document describes the execution tracking and timeout monitoring features added to improve observability and debugging of stuck Jupyter notebook executions.
+
+## Overview
+
+The execution tracking system provides:
+
+1. **Execution Event Tracking**: Logs every cell execution with start/end times, duration, and success status
+2. **Execution Metadata Publishing**: Publishes structured metadata to the webapp for monitoring
+3. **ZMQ Message Flow Logging**: Enhanced debugging of kernel-server communication
+4. **Execution Timeout Monitoring**: Optional monitoring with warnings and auto-interrupt for stuck executions
+
+## Features
+
+### 1. Execution Event Tracking
+
+**Location**: `deepnote_toolkit/execution_tracking.py`
+
+Automatically tracks all notebook cell executions and logs:
+- Execution start with cell preview
+- Execution end with duration and success status
+- Error types if execution fails
+
+**Log Format**:
+```
+EXEC_START | count=1 | cell_id=12345 | preview=import pandas as pd\ndf = pd.read_csv(...
+EXEC_END | count=1 | duration=2.45s | success=True
+```
+
+This is **enabled by default** and requires no configuration.
+
+### 2. Execution Metadata Publishing
+
+**Location**: `deepnote_toolkit/ipython_utils.py`
+
+Publishes execution metadata to the webapp via the `application/vnd.deepnote.execution-metadata+json` MIME type.
+
+**Metadata Structure**:
+```python
+{
+    "execution_count": 1,
+    "duration_seconds": 2.45,
+    "success": True,
+    "error_type": "KeyError"  # Only present if execution failed
+}
+```
+
+This allows the webapp to:
+- Display execution duration in the UI
+- Track execution history
+- Alert on failed executions
+- Build execution analytics
+
+### 3. ZMQ Session Debug Logging
+
+**Location**: `deepnote_core/resources/jupyter/jupyter_server_config.py:817`
+
+Enables detailed ZMQ message logging by setting:
+```python
+c.Session.debug = True
+```
+
+This logs all ZMQ messages exchanged between the Jupyter server and kernel, which is critical for debugging:
+- Message delays
+- Message loss
+- Protocol issues
+- Kernel communication problems
+
+**Enhanced Logging Configuration** (line 418-426):
+```python
+c.ServerApp.logging_config = {
+    "loggers": {
+        "tornado.access": {"level": "DEBUG"},
+        "jupyter_server.serverapp": {"level": "DEBUG"},
+        "jupyter_client.session": {"level": "DEBUG"},
+        "execution_tracker": {"level": "INFO"},
+        "execution_tracking": {"level": "INFO"},
+    }
+}
+```
+
+### 4. Execution Timeout Monitoring (Optional)
+
+**Location**: `deepnote_toolkit/execution_timeout.py`
+
+Optional feature that monitors execution duration and can:
+- Send warnings when executions exceed a threshold
+- Report to the webapp
+- Automatically interrupt stuck executions (if enabled)
+
+**Configuration via Environment Variables**:
+
+```bash
+# Enable timeout monitoring (default: false)
+export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true
+
+# Warning threshold in seconds (default: 240 = 4 minutes)
+export DEEPNOTE_EXECUTION_WARNING_THRESHOLD=240
+
+# Timeout threshold in seconds (default: 300 = 5 minutes)
+export DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD=300
+
+# Enable automatic interrupt of stuck executions (default: false)
+# WARNING: Use with caution! This will send SIGINT to interrupt executions
+export DEEPNOTE_EXECUTION_AUTO_INTERRUPT=false
+```
+
+**Example Usage**:
+```bash
+# Enable with default thresholds
+export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true
+
+# Enable with custom thresholds and auto-interrupt
+export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true
+export DEEPNOTE_EXECUTION_WARNING_THRESHOLD=120  # Warn after 2 minutes
+export DEEPNOTE_EXECUTION_TIMEOUT_THRESHOLD=180  # Timeout after 3 minutes
+export DEEPNOTE_EXECUTION_AUTO_INTERRUPT=true    # Auto-interrupt
+```
+
+## Log Locations
+
+All execution tracking logs are written to the standard Deepnote log location:
+- **File**: `$DEEPNOTE_LOG_DIR/helpers.log` (or XDG default)
+- **Scraped by**: Loki for centralized logging
+
+## Debugging Stuck Executions
+
+When investigating stuck executions, follow this process:
+
+1. **Check Execution Logs**: Look for `EXEC_START` without matching `EXEC_END`
+   ```bash
+   grep "EXEC_START\|EXEC_END" $DEEPNOTE_LOG_DIR/helpers.log
+   ```
+
+2. **Check ZMQ Message Flow**: Look for message delays or missing messages
+   ```bash
+   grep "Session.send\|Session.recv" /var/log/jupyter-server.log
+   ```
+
+3. **Check for Long Executions**: Look for `LONG_EXECUTION` warnings
+   ```bash
+   grep "LONG_EXECUTION" $DEEPNOTE_LOG_DIR/helpers.log
+   ```
+
+4. **Enable Timeout Monitoring**: If not already enabled, set environment variables and restart kernel
+
+## Implementation Details
+
+### Execution Tracking Flow
+
+1. User executes a cell
+2. IPython fires `pre_execute` event → `ExecutionTracker.on_pre_execute()`
+   - Logs `EXEC_START`
+   - Records start time and cell metadata
+3. Cell executes...
+4. IPython fires `post_execute` event → `ExecutionTracker.on_post_execute()`
+   - Calculates duration
+   - Logs `EXEC_END` with duration and success status
+   - Publishes metadata to webapp
+
+### Timeout Monitoring Flow (if enabled)
+
+1. `pre_execute` event → `ExecutionTimeoutMonitor.on_pre_execute()`
+   - Starts warning timer (default: 240s)
+   - Starts timeout timer (default: 300s) if auto-interrupt enabled
+2. If warning timer fires → `_send_warning()`
+   - Logs `LONG_EXECUTION` warning
+   - Reports to webapp
+3. If timeout timer fires → `_interrupt_execution()`
+   - Logs `TIMEOUT_INTERRUPT` error
+   - Reports to webapp
+   - Sends SIGINT to kernel (if auto-interrupt enabled)
+4. `post_execute` event → cancels all timers
+
+## Files Modified/Created
+
+### New Files
+- `deepnote_toolkit/execution_tracking.py` - Execution tracking implementation
+- `deepnote_toolkit/execution_timeout.py` - Timeout monitoring implementation
+- `docs/EXECUTION_TRACKING.md` - This documentation
+
+### Modified Files
+- `deepnote_toolkit/runtime_initialization.py` - Added tracking and timeout setup
+- `deepnote_toolkit/ipython_utils.py` - Added execution metadata publishing
+- `deepnote_core/resources/jupyter/jupyter_server_config.py` - Enhanced logging config
+
+## Testing
+
+To test the execution tracking:
+
+1. Start a Jupyter kernel with the toolkit
+2. Execute a simple cell:
+   ```python
+   print("Hello, world!")
+   ```
+3. Check logs for execution tracking:
+   ```bash
+   tail -f $DEEPNOTE_LOG_DIR/helpers.log | grep "EXEC_"
+   ```
+4. You should see:
+   ```
+   EXEC_START | count=1 | cell_id=... | preview=print("Hello, world!")
+   EXEC_END | count=1 | duration=0.01s | success=True
+   ```
+
+To test timeout monitoring:
+
+1. Enable timeout monitoring:
+   ```bash
+   export DEEPNOTE_ENABLE_EXECUTION_TIMEOUT=true
+   export DEEPNOTE_EXECUTION_WARNING_THRESHOLD=5  # 5 seconds for testing
+   ```
+2. Restart kernel
+3. Execute a long-running cell:
+   ```python
+   import time
+   time.sleep(10)
+   ```
+4. After 5 seconds, you should see a `LONG_EXECUTION` warning in the logs
+
+## Future Enhancements
+
+Potential improvements to consider:
+
+1. **Kernel Heartbeat Monitoring**: Monitor ZMQ heartbeat channel to detect dead kernels
+2. **Message Queue Metrics**: Track IOPub message queue depth to detect backpressure
+3. **Execution History**: Store execution history in database for analytics
+4. **WebApp Integration**: Display execution metrics in real-time in the UI
+5. **Adaptive Timeouts**: Adjust timeout thresholds based on historical execution patterns
+6. **Execution Profiling**: Integrate with Python profilers for detailed performance analysis
+
+## References
+
+- Jupyter Client Protocol: https://jupyter-client.readthedocs.io/en/stable/messaging.html
+- IPython Events: https://ipython.readthedocs.io/en/stable/config/callbacks.html
+- ZMQ Guide: https://zguide.zeromq.org/

From 4020118cfa59fbad863215a0cabc32518e273665 Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 18:05:21 +0100
Subject: [PATCH 6/7] Fix LoggerManager and webapp URL imports

- Fix LoggerManager usage: use LoggerManager().get_logger() instead of LoggerManager.get_logger()
- Fix webapp URL import: use get_absolute_userpod_api_url() instead of non-existent get_webapp_url()
- All imports and functionality tests now pass
---
 deepnote_toolkit/execution_timeout.py  | 25 +++++++------------------
 deepnote_toolkit/execution_tracking.py |  8 ++++----
 2 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/deepnote_toolkit/execution_timeout.py b/deepnote_toolkit/execution_timeout.py
index 19945c1..66d61cd 100644
--- a/deepnote_toolkit/execution_timeout.py
+++ b/deepnote_toolkit/execution_timeout.py
@@ -11,7 +11,7 @@
 import requests
 from IPython.core.interactiveshell import ExecutionInfo, ExecutionResult
 
-from .get_webapp_url import get_webapp_url
+from .get_webapp_url import get_absolute_userpod_api_url
 from .logging import LoggerManager
 
 
@@ -34,7 +34,7 @@ def __init__(
             timeout_seconds: Seconds after which to consider execution stuck (default: 300s = 5min)
             enable_auto_interrupt: Whether to automatically interrupt stuck executions (default: False)
         """
-        self.logger = LoggerManager.get_logger("execution_timeout")
+        self.logger = LoggerManager().get_logger()
         self.warning_threshold = warning_threshold_seconds
         self.timeout_threshold = timeout_seconds
         self.enable_auto_interrupt = enable_auto_interrupt
@@ -152,19 +152,8 @@ def _report_to_webapp(
             warning: Whether this is a warning (True) or timeout (False)
         """
         try:
-            webapp_url = get_webapp_url()
-            project_id = os.getenv("DEEPNOTE_PROJECT_ID")
-
-            if not webapp_url or not project_id:
-                self.logger.debug(
-                    "Webapp URL or project ID not available, skipping report"
-                )
-                return
-
-            endpoint = (
-                "warning" if warning else "timeout"
-            )
-            url = f"{webapp_url}/userpod-api/{project_id}/execution/{endpoint}"
+            endpoint = "warning" if warning else "timeout"
+            url = get_absolute_userpod_api_url(f"execution/{endpoint}")
 
             payload = {
                 "duration": duration,
@@ -209,7 +198,7 @@ def setup_execution_timeout_monitor(
 
         ip = get_ipython()
         if ip is None:
-            LoggerManager.get_logger("execution_timeout").warning(
+            LoggerManager().get_logger().warning(
                 "IPython instance not available, skipping timeout monitor setup"
             )
             return
@@ -224,7 +213,7 @@ def setup_execution_timeout_monitor(
         ip.events.register("pre_execute", _timeout_monitor.on_pre_execute)
         ip.events.register("post_execute", _timeout_monitor.on_post_execute)
 
-        LoggerManager.get_logger("execution_timeout").info(
+        LoggerManager().get_logger().info(
             "Execution timeout monitor initialized: warning=%ds, timeout=%ds, auto_interrupt=%s",
             warning_threshold_seconds,
             timeout_seconds,
@@ -232,6 +221,6 @@ def setup_execution_timeout_monitor(
         )
 
     except Exception as e:  # pylint: disable=broad-exception-caught
-        LoggerManager.get_logger("execution_timeout").error(
+        LoggerManager().get_logger().error(
             "Failed to set up timeout monitor: %s", e
         )
diff --git a/deepnote_toolkit/execution_tracking.py b/deepnote_toolkit/execution_tracking.py
index 3afe4e5..3ac5b39 100644
--- a/deepnote_toolkit/execution_tracking.py
+++ b/deepnote_toolkit/execution_tracking.py
@@ -17,7 +17,7 @@ class ExecutionTracker:
     """Tracks execution state of notebook cells."""
 
     def __init__(self):
-        self.logger = LoggerManager.get_logger("execution_tracker")
+        self.logger = LoggerManager().get_logger()
         self.current_execution: Optional[Dict[str, Any]] = None
         self.execution_count = 0
 
@@ -109,7 +109,7 @@ def setup_execution_tracking() -> None:
 
         ip = get_ipython()
         if ip is None:
-            LoggerManager.get_logger("execution_tracking").warning(
+            LoggerManager().get_logger().warning(
                 "IPython instance not available, skipping execution tracking setup"
             )
             return
@@ -122,11 +122,11 @@ def setup_execution_tracking() -> None:
         ip.events.register("pre_run_cell", _execution_tracker.on_pre_run_cell)
         ip.events.register("post_run_cell", _execution_tracker.on_post_run_cell)
 
-        LoggerManager.get_logger("execution_tracking").info(
+        LoggerManager().get_logger().info(
             "Execution tracking initialized successfully"
         )
 
     except Exception as e:  # pylint: disable=broad-exception-caught
-        LoggerManager.get_logger("execution_tracking").error(
+        LoggerManager().get_logger().error(
             "Failed to set up execution tracking: %s", e
         )

From 8063f60cec2d327cff99b1a14607920e1a657da8 Mon Sep 17 00:00:00 2001
From: Hannes Probst <hannes@deepnote.com>
Date: Fri, 14 Nov 2025 18:18:36 +0100
Subject: [PATCH 7/7] Fix blocking issues from PR review

- Fix incorrect timestamp: use time.time() instead of duration in metadata
- Move time imports to module level in execution_timeout.py
- Add threading lock to fix race condition in timeout monitoring
  - Protect current_execution access with lock
  - Copy execution data before processing outside lock
- All fixes validated and tested
---
 deepnote_toolkit/execution_timeout.py | 79 ++++++++++++++-------------
 deepnote_toolkit/ipython_utils.py     |  3 +-
 2 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/deepnote_toolkit/execution_timeout.py b/deepnote_toolkit/execution_timeout.py
index 66d61cd..b764734 100644
--- a/deepnote_toolkit/execution_timeout.py
+++ b/deepnote_toolkit/execution_timeout.py
@@ -6,6 +6,7 @@
 import os
 import signal
 import threading
+import time
 from typing import Optional
 
 import requests
@@ -41,36 +42,36 @@ def __init__(
         self.current_execution: Optional[dict] = None
         self.warning_timer: Optional[threading.Timer] = None
         self.timeout_timer: Optional[threading.Timer] = None
+        self._execution_lock = threading.Lock()
 
     def on_pre_execute(self, info: ExecutionInfo) -> None:
         """
         Called before executing a cell.
         Starts timers for warning and timeout.
         """
-        import time
-
         cell_preview = info.raw_cell[:100] if info.raw_cell else "<empty>"
 
-        self.current_execution = {
-            "code": cell_preview,
-            "start": time.time(),
-        }
-
-        # Start warning timer
-        if self.warning_threshold > 0:
-            self.warning_timer = threading.Timer(
-                self.warning_threshold, self._send_warning
-            )
-            self.warning_timer.daemon = True
-            self.warning_timer.start()
+        with self._execution_lock:
+            self.current_execution = {
+                "code": cell_preview,
+                "start": time.time(),
+            }
 
-        # Start timeout timer
-        if self.enable_auto_interrupt and self.timeout_threshold > 0:
-            self.timeout_timer = threading.Timer(
-                self.timeout_threshold, self._interrupt_execution
-            )
-            self.timeout_timer.daemon = True
-            self.timeout_timer.start()
+            # Start warning timer
+            if self.warning_threshold > 0:
+                self.warning_timer = threading.Timer(
+                    self.warning_threshold, self._send_warning
+                )
+                self.warning_timer.daemon = True
+                self.warning_timer.start()
+
+            # Start timeout timer
+            if self.enable_auto_interrupt and self.timeout_threshold > 0:
+                self.timeout_timer = threading.Timer(
+                    self.timeout_threshold, self._interrupt_execution
+                )
+                self.timeout_timer.daemon = True
+                self.timeout_timer.start()
 
         self.logger.debug(
             "Timeout monitoring started: warning=%ds, timeout=%ds, auto_interrupt=%s",
@@ -84,8 +85,9 @@ def on_post_execute(self, result: ExecutionResult) -> None:
         Called after executing a cell.
         Cancels any pending timers.
         """
-        self._cancel_timers()
-        self.current_execution = None
+        with self._execution_lock:
+            self._cancel_timers()
+            self.current_execution = None
 
     def _cancel_timers(self) -> None:
         """Cancel all active timers."""
@@ -98,13 +100,15 @@ def _cancel_timers(self) -> None:
 
     def _send_warning(self) -> None:
         """Send warning when execution is running longer than threshold."""
-        if not self.current_execution:
-            return
-
-        import time
+        # Capture execution data while holding lock
+        with self._execution_lock:
+            if not self.current_execution:
+                return
+            execution_data = self.current_execution.copy()
 
-        duration = time.time() - self.current_execution["start"]
-        code_preview = self.current_execution["code"][:50]
+        # Process outside lock to avoid blocking
+        duration = time.time() - execution_data["start"]
+        code_preview = execution_data["code"][:50]
 
         self.logger.warning(
             "LONG_EXECUTION | duration=%.1fs | preview=%s",
@@ -117,12 +121,15 @@ def _send_warning(self) -> None:
 
     def _interrupt_execution(self) -> None:
         """Interrupt execution after timeout threshold is exceeded."""
-        if not self.current_execution:
-            return
+        # Capture execution data while holding lock
+        with self._execution_lock:
+            if not self.current_execution:
+                return
+            execution_data = self.current_execution.copy()
 
-        import time
-
-        duration = time.time() - self.current_execution["start"]
+        # Process outside lock to avoid blocking
+        duration = time.time() - execution_data["start"]
+        code_preview = execution_data["code"][:50]
 
         self.logger.error(
             "TIMEOUT_INTERRUPT | duration=%.1fs | Sending SIGINT to interrupt execution",
@@ -130,9 +137,7 @@ def _interrupt_execution(self) -> None:
         )
 
         # Report to webapp before interrupting
-        self._report_to_webapp(
-            duration, self.current_execution["code"][:50], warning=False
-        )
+        self._report_to_webapp(duration, code_preview, warning=False)
 
         # Send SIGINT to interrupt the execution (simulates Ctrl+C)
         try:
diff --git a/deepnote_toolkit/ipython_utils.py b/deepnote_toolkit/ipython_utils.py
index b51d749..67813fc 100644
--- a/deepnote_toolkit/ipython_utils.py
+++ b/deepnote_toolkit/ipython_utils.py
@@ -1,3 +1,4 @@
+import time
 from typing import Optional
 
 from IPython import get_ipython
@@ -56,7 +57,7 @@ def publish_execution_metadata(
         "execution_count": execution_count,
         "duration_seconds": duration,
         "success": success,
-        "timestamp": duration,  # Using duration as timestamp for now
+        "timestamp": time.time(),
     }
 
     if error_type: