# Scheduling

> Resource scheduling policies for plugin execution

In [None]:
#| default_exp core.scheduling

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import time
import asyncio
from abc import ABC, abstractmethod
from typing import Dict, Any, Callable, Awaitable, Set

from cjm_plugin_system.core.metadata import PluginMeta

import logging
# Configure the root logger when this module is imported: INFO level, with every
# message prefixed by "[ResourceScheduler]".
# NOTE(review): force=True is documented to remove handlers already attached to
# the root logger, so importing this module presumably overrides any logging
# setup the host application did earlier -- confirm this is intended.
logging.basicConfig(
    level=logging.INFO,
    format='[ResourceScheduler] %(message)s',
    force=True
)

## ResourceScheduler

Abstract base class that defines the scheduling policy interface. Schedulers decide whether a plugin can execute based on its resource requirements and current system state.

The separation of **Mechanism** (plugins reporting stats) from **Policy** (schedulers deciding allocation) allows the same plugin ecosystem to serve:

- **Interactive web apps**: Use `SafetyScheduler` to prevent OOM crashes
- **Batch processing**: Use `QueueScheduler` to wait for resources
- **Development**: Use `PermissiveScheduler` to run everything

In [None]:
#| export
class ResourceScheduler(ABC):
    """Interface that every resource allocation policy implements.

    Subclasses must supply `allocate` and the two execution lifecycle hooks.
    `allocate_async` ships with a default that is built on top of `allocate`.
    """

    @abstractmethod
    def allocate(
        self,
        plugin_meta: PluginMeta,  # Metadata of the plugin asking to run
        stats_provider: Callable[[], Dict[str, Any]]  # Zero-argument callable producing fresh stats
    ) -> bool:  # True when the plugin is allowed to execute
        """Return whether the plugin may start, given its requirements and the system state."""

    async def allocate_async(
        self,
        plugin_meta: PluginMeta,  # Metadata of the plugin asking to run
        stats_provider: Callable[[], Awaitable[Dict[str, Any]]]  # Zero-argument callable returning awaitable stats
    ) -> bool:  # True when the plugin is allowed to execute
        """Async variant of `allocate`.

        The default awaits the provider exactly once, then hands that snapshot to
        the synchronous `allocate` through a provider that always returns it.
        """
        snapshot = await stats_provider()

        def _frozen_stats() -> Dict[str, Any]:
            # Every call made by `allocate` sees the same, already-fetched stats
            return snapshot

        return self.allocate(plugin_meta, _frozen_stats)

    @abstractmethod
    def on_execution_start(
        self,
        plugin_name: str  # Name of the plugin whose execution is beginning
    ) -> None:
        """Hook invoked when a task starts, so the scheduler can reserve resources."""

    @abstractmethod
    def on_execution_finish(
        self,
        plugin_name: str  # Name of the plugin whose execution has ended
    ) -> None:
        """Hook invoked when a task finishes, so the scheduler can release resources."""

## PermissiveScheduler

Default scheduler that allows all executions. Use this for development, scripting, and batch processing where you want maximum throughput without safety checks.

In [None]:
#| export
class PermissiveScheduler(ResourceScheduler):
    """Allow-everything policy (Default / Dev Mode).

    Never consults system stats and keeps no state about running plugins.
    """

    def allocate(
        self,
        plugin_meta: PluginMeta,  # Metadata of the requesting plugin (not inspected)
        stats_provider: Callable[[], Dict[str, Any]]  # Stats provider (never called)
    ) -> bool:  # Unconditionally True
        """Approve every request without looking at resources."""
        return True

    def on_execution_start(
        self,
        plugin_name: str  # Name of the plugin whose execution is beginning
    ) -> None:
        """Do nothing: this policy reserves no resources."""
        return None

    def on_execution_finish(
        self,
        plugin_name: str  # Name of the plugin whose execution has ended
    ) -> None:
        """Do nothing: this policy has no resources to release."""
        return None

## SafetyScheduler

Production scheduler that rejects execution immediately (rather than waiting) if resources are insufficient. It checks free GPU VRAM and available system RAM against the plugin requirements defined in the manifest.

Resource requirements are read from the plugin manifest:

```json
{
  "resources": {
    "requires_gpu": true,
    "min_gpu_vram_mb": 4096,
    "min_system_ram_mb": 8192
  }
}
```

System stats are provided by a System Monitor plugin implementing the `cjm-infra-plugin-system` interface.

In [None]:
#| export
class SafetyScheduler(ResourceScheduler):
    """Scheduler that prevents execution if resources are insufficient.

    Requirements are read from the `resources` section of the plugin manifest
    (`requires_gpu`, `min_gpu_vram_mb`, `min_system_ram_mb`) and compared with
    the `gpu_free_memory_mb` / `memory_available_mb` entries of the stats dict.
    A check whose stat is missing is skipped; it never blocks execution.
    """
    
    def _check_resources(
        self,
        plugin_meta: PluginMeta,  # Plugin metadata with manifest
        stats: Dict[str, Any]  # Current system stats
    ) -> bool:  # True if resources available
        """Check if system has sufficient resources for the plugin."""
        # The manifest (or its `resources` section) may be absent or None;
        # in both cases there are no requirements to enforce.
        manifest = getattr(plugin_meta, 'manifest', None) or {}
        reqs = manifest.get('resources') or {}
        if not reqs:
            return True  # No requirements defined

        # Tolerate a provider that returns None instead of an empty dict
        stats = stats or {}

        # Check GPU Memory
        if reqs.get('requires_gpu', False):
            needed_vram = reqs.get('min_gpu_vram_mb') or 0  # null in manifest -> no minimum
            available_vram = stats.get('gpu_free_memory_mb')

            if available_vram is None:
                # VRAM cannot be verified, so warn and skip only this check.
                # Do NOT return here: the system RAM requirement below must
                # still be enforced.
                print("[Scheduler] Warning: No GPU stats available.")
            elif needed_vram > available_vram:
                print(f"[Scheduler] Blocked {plugin_meta.name}: Needs {needed_vram}MB VRAM, has {available_vram}MB")
                return False

        # Check System RAM
        needed_ram = reqs.get('min_system_ram_mb') or 0  # null in manifest -> no minimum
        available_ram = stats.get('memory_available_mb')

        if available_ram is not None and needed_ram > available_ram:
            print(f"[Scheduler] Blocked {plugin_meta.name}: Needs {needed_ram}MB RAM, has {available_ram}MB")
            return False

        return True
    
    def allocate(
        self,
        plugin_meta: PluginMeta,  # Metadata of the plugin requesting resources
        stats_provider: Callable[[], Dict[str, Any]]  # Function returning current stats
    ) -> bool:  # True if resources are available
        """Check resource requirements against system state.

        Calls `stats_provider` once and decides immediately; it never waits.
        """
        return self._check_resources(plugin_meta, stats_provider())

    def on_execution_start(
        self,
        plugin_name: str  # Name of the plugin starting execution
    ) -> None:
        """Called when execution starts (for future resource reservation)."""
        pass

    def on_execution_finish(
        self,
        plugin_name: str  # Name of the plugin finishing execution
    ) -> None:
        """Called when execution finishes (for future resource release)."""
        pass

## QueueScheduler

Batch processing scheduler that waits for resources to become available. Polls the system monitor until resources are free or timeout is reached.

- **Sync path**: Uses `time.sleep()` for blocking wait (scripts, batch jobs)
- **Async path**: Uses `await asyncio.sleep()` for non-blocking wait (FastHTML, async apps)
- **Active tracking**: Records which plugins have running executions (updated by `on_execution_start()` / `on_execution_finish()`) and exposes them via `get_active_plugins()`

The active plugin tracking enables smart eviction: plugins that are currently executing should never be evicted, while idle plugins can be safely released to free resources.

In [None]:
#| export
class QueueScheduler(ResourceScheduler):
    """Scheduler that waits for resources to become available.

    `allocate` / `allocate_async` poll the stats provider every `poll_interval`
    seconds until the plugin's requirements are met or `timeout` seconds have
    elapsed. Elapsed time is measured with a monotonic clock, so wall-clock
    adjustments cannot shorten or extend the wait.

    The scheduler also records which plugins are currently executing. A
    per-plugin count is kept so that a plugin with several overlapping
    executions stays active until the last one finishes.
    """
    
    def __init__(
        self,
        timeout: float = 300.0,  # Max seconds to wait for resources
        poll_interval: float = 2.0  # Seconds between resource checks
    ):
        """Initialize queue scheduler with timeout and polling settings."""
        self.timeout = timeout
        self.poll_interval = poll_interval
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
        self._active_plugins: Set[str] = set()  # Track plugins with running executions
        self._active_counts: Dict[str, int] = {}  # Running executions per plugin name

    def _check_resources(
        self,
        plugin_meta: PluginMeta,  # Plugin metadata with manifest
        stats: Dict[str, Any]  # Current system stats
    ) -> bool:  # True if resources available
        """Check if system has sufficient resources for the plugin.

        A check whose stat is missing from `stats` is skipped.
        """
        # The manifest (or its `resources` section) may be absent or None;
        # in both cases there are no requirements to enforce.
        manifest = getattr(plugin_meta, 'manifest', None) or {}
        reqs = manifest.get('resources') or {}
        if not reqs:
            return True

        # Tolerate a provider that returns None instead of an empty dict
        stats = stats or {}

        # Check GPU Memory
        if reqs.get('requires_gpu', False):
            needed_vram = reqs.get('min_gpu_vram_mb') or 0  # null in manifest -> no minimum
            available_vram = stats.get('gpu_free_memory_mb')
            if available_vram is not None and needed_vram > available_vram:
                return False

        # Check System RAM
        needed_ram = reqs.get('min_system_ram_mb') or 0  # null in manifest -> no minimum
        available_ram = stats.get('memory_available_mb')
        if available_ram is not None and needed_ram > available_ram:
            return False

        return True
    
    def allocate(
        self,
        plugin_meta: PluginMeta,  # Metadata of the plugin requesting resources
        stats_provider: Callable[[], Dict[str, Any]]  # Function returning current stats
    ) -> bool:  # True if resources become available before timeout
        """Wait for resources using blocking sleep."""
        deadline = time.monotonic() + self.timeout

        while True:
            stats = stats_provider()
            if self._check_resources(plugin_meta, stats):
                return True

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                self.logger.error(f"Timeout waiting for resources for {plugin_meta.name}")
                return False

            # Never sleep past the deadline; the last poll happens at the timeout
            wait = min(self.poll_interval, remaining)
            self.logger.info(f"Resources busy. Waiting {wait}s...")
            time.sleep(wait)

    async def allocate_async(
        self,
        plugin_meta: PluginMeta,  # Metadata of the plugin requesting resources
        stats_provider: Callable[[], Awaitable[Dict[str, Any]]]  # Async stats function
    ) -> bool:  # True if resources become available before timeout
        """Wait for resources using non-blocking async sleep."""
        deadline = time.monotonic() + self.timeout

        while True:
            stats = await stats_provider()
            if self._check_resources(plugin_meta, stats):
                return True

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                self.logger.error(f"Timeout waiting for resources for {plugin_meta.name}")
                return False

            # Never sleep past the deadline; the last poll happens at the timeout
            wait = min(self.poll_interval, remaining)
            self.logger.info(f"Resources busy. Yielding {wait}s...")
            await asyncio.sleep(wait)

    def on_execution_start(
        self,
        plugin_name: str  # Name of the plugin starting execution
    ) -> None:
        """Track that a plugin has started executing."""
        self._active_counts[plugin_name] = self._active_counts.get(plugin_name, 0) + 1
        self._active_plugins.add(plugin_name)

    def on_execution_finish(
        self,
        plugin_name: str  # Name of the plugin finishing execution
    ) -> None:
        """Track that a plugin has finished executing.

        The plugin is only marked idle once every started execution has
        finished. A finish without a matching start is ignored.
        """
        still_running = self._active_counts.get(plugin_name, 0) - 1
        if still_running > 0:
            # Other executions of the same plugin are still in flight
            self._active_counts[plugin_name] = still_running
            return
        self._active_counts.pop(plugin_name, None)
        self._active_plugins.discard(plugin_name)

    def get_active_plugins(self) -> Set[str]:  # Set of currently executing plugin names
        """Get the set of plugins with active executions.

        Returns a copy, so callers cannot mutate the scheduler's own state.
        """
        return self._active_plugins.copy()

## Usage Examples

### Development Mode (Default)

```python
from cjm_plugin_system.core.manager import PluginManager

# Default: PermissiveScheduler allows everything
manager = PluginManager()
```

### Production Mode with Safety Checks

```python
from cjm_plugin_system.core.manager import PluginManager
from cjm_plugin_system.core.scheduling import SafetyScheduler

# Create manager with safety scheduler
manager = PluginManager(scheduler=SafetyScheduler())

# Load plugins
manager.load_all()

# Register a system monitor plugin for real-time stats
manager.register_system_monitor("sys-mon-nvidia")

# Execution will now check resources before running
try:
    result = manager.execute_plugin("whisper-local", audio="/path/to/audio.wav")
except RuntimeError as e:
    print(f"Blocked: {e}")
```

### Batch Processing with Queue

```python
from cjm_plugin_system.core.manager import PluginManager
from cjm_plugin_system.core.scheduling import QueueScheduler

# Create manager with queue scheduler (waits up to 5 minutes)
manager = PluginManager(scheduler=QueueScheduler(timeout=300.0, poll_interval=5.0))
manager.load_all()
manager.register_system_monitor("sys-mon-nvidia")

# Will block until resources are available or timeout
result = manager.execute_plugin("whisper-local", audio="/path/to/audio.wav")
```

### Async App with Queue (FastHTML)

```python
from cjm_plugin_system.core.manager import PluginManager
from cjm_plugin_system.core.scheduling import QueueScheduler

manager = PluginManager(scheduler=QueueScheduler())
manager.load_all()
manager.register_system_monitor("sys-mon-nvidia")

# Non-blocking wait using asyncio.sleep
result = await manager.execute_plugin_async("whisper-local", audio="/path/to/audio.wav")
```

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()