# Job Queue

> Resource-aware job queue for sequential plugin execution with cancellation support

In [None]:
#| default_exp core.queue

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import asyncio
import heapq
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, TYPE_CHECKING

from cjm_plugin_system.core.manager import PluginManager

import logging
# Configure root logging so queue messages are prefixed with "[JobQueue]".
# NOTE(review): this runs at import time. Per the stdlib docs, `force=True` removes
# and closes any handlers already attached to the root logger, so importing this
# module replaces the host application's logging setup — confirm that is intended.
logging.basicConfig(
    level=logging.INFO,
    format='[JobQueue] %(message)s',
    force=True
)

The `JobQueue` provides a resource-aware job queue for plugin execution:

```
┌─────────────────────────────────────────────────────────────┐
│                    User Application                         │
│  ┌─────────────────────────────────────────────────────┐    │
│  │                    JobQueue                         │    │
│  │  ┌───────────┐  ┌───────────┐  ┌───────────────┐    │    │
│  │  │ Pending   │  │ Running   │  │ History       │    │    │
│  │  │ Jobs      │→ │ Job       │→ │ (completed/   │    │    │
│  │  │ (heap)    │  │           │  │  cancelled)   │    │    │
│  │  └───────────┘  └───────────┘  └───────────────┘    │    │
│  └─────────────────────────────────────────────────────┘    │
│         │                │                                  │
│         ▼                ▼                                  │
│  ┌─────────────┐  ┌─────────────┐                           │
│  │ Scheduler   │  │ Plugin      │                           │
│  │ (policy)    │  │ Manager     │                           │
│  └─────────────┘  └─────────────┘                           │
└─────────────────────────────────────────────────────────────┘
```

Key features:

- **Priority queue**: Higher priority jobs execute first (FIFO within same priority)
- **Resource-aware**: Waits for resources, triggers eviction of idle plugins
- **Cancellation**: Cancel pending or running jobs with graceful fallback to force termination
- **Progress tracking**: Poll progress and status messages during execution
- **Observable**: Get queue state for UI integration

## JobStatus

Enumeration of possible job states.

In [None]:
#| export
class JobStatus(str, Enum):
    """Status of a job in the queue.

    Mixes in `str`, so each member is also a string equal to its value.
    """
    pending = "pending"  # Queued, not yet picked up for execution
    running = "running"  # Currently executing
    completed = "completed"  # Finished and produced a result
    failed = "failed"  # Execution raised an error
    cancelled = "cancelled"  # Cancelled while pending or running

## Job

Dataclass representing a queued plugin execution.

In [None]:
#| export
@dataclass
class Job:
    """One plugin execution request tracked by the queue."""
    id: str  # Unique identifier for this job
    plugin_name: str  # Name of the plugin to run
    args: Tuple[Any, ...]  # Positional arguments forwarded to the plugin
    kwargs: Dict[str, Any]  # Keyword arguments forwarded to the plugin
    status: JobStatus = JobStatus.pending  # Lifecycle state
    priority: int = 0  # Larger values run sooner
    created_at: float = field(default_factory=time.time)  # When the job object was created
    started_at: Optional[float] = None  # When execution began
    completed_at: Optional[float] = None  # When the job reached a terminal state
    result: Any = None  # Return value on success
    error: Optional[str] = None  # Error text on failure
    progress: float = 0.0  # 0.0 to 1.0, or -1.0 for indeterminate
    status_message: str = ""  # Human-readable progress note

    def __lt__(self, other: 'Job') -> bool:
        """Order jobs for a min-heap: highest priority first, oldest first on ties."""
        # Priority is negated because heapq pops the smallest element first
        own_key = (-self.priority, self.created_at)
        other_key = (-other.priority, other.created_at)
        return own_key < other_key

## JobQueue

Main queue class that manages job submission, execution, and lifecycle.

In [None]:
#| export
class JobQueue:
    """Priority-ordered queue that runs plugin jobs one at a time."""

    def __init__(
        self,
        manager: PluginManager,  # Plugin manager instance
        max_history: int = 100,  # Max completed jobs to retain
        cancel_timeout: float = 3.0,  # Seconds to wait for cooperative cancel
        progress_poll_interval: float = 1.0  # Seconds between progress polls
    ):
        """Create an idle queue with no pending jobs and no processor task."""
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")

        # Configuration supplied by the caller
        self.manager = manager
        self.max_history = max_history
        self.cancel_timeout = cancel_timeout
        self.progress_poll_interval = progress_poll_interval

        # Job bookkeeping
        self._jobs: Dict[str, Job] = {}  # Every submitted job, keyed by ID
        self._pending: List[Job] = []  # Heap of jobs waiting to run
        self._running: Optional[Job] = None  # Job currently executing, if any
        self._history: List[Job] = []  # Jobs that reached a terminal state
        self._cancel_requested: Set[str] = set()  # IDs of running jobs asked to cancel

        # Coordination primitives
        self._lock = asyncio.Lock()
        self._job_available = asyncio.Event()
        self._job_completed_events: Dict[str, asyncio.Event] = {}

        # Background processor
        self._processor_task: Optional[asyncio.Task] = None
        self._running_flag = False

### Job Submission

In [None]:
#| export
async def submit(
    self,
    plugin_name: str,  # Target plugin
    *args,
    priority: int = 0,  # Higher = more urgent
    **kwargs
) -> str:  # Returns job_id
    """Queue a plugin execution and return the ID of the new job."""
    job_id = str(uuid.uuid4())
    # Build the job before taking the lock so its creation timestamp is not
    # delayed by lock contention
    new_job = Job(id=job_id, plugin_name=plugin_name, args=args, kwargs=kwargs, priority=priority)

    async with self._lock:
        self._job_completed_events[job_id] = asyncio.Event()
        self._jobs[job_id] = new_job
        heapq.heappush(self._pending, new_job)
        # Wake the processor loop if it is idle
        self._job_available.set()

    self.logger.info(f"Submitted job {job_id[:8]} for {plugin_name} (priority={priority})")
    return job_id

# Attach the module-level function to JobQueue as a method
JobQueue.submit = submit

### Job Control

In [None]:
#| export
async def cancel(
    self,
    job_id: str  # Job to cancel
) -> bool:  # True if cancelled
    """Cancel a pending or running job.

    A pending job is removed from the heap and finalized immediately. For a
    running job only a cancellation request is recorded; `True` then means the
    request was accepted, not that the job has already stopped. Returns `False`
    for an unknown ID or a job that has already finished.
    """
    async with self._lock:
        job = self._jobs.get(job_id)
        if not job:
            return False

        if job.status == JobStatus.pending:
            job.status = JobStatus.cancelled
            job.completed_at = time.time()
            # Rebuild the heap without the cancelled job
            self._pending = [j for j in self._pending if j.id != job_id]
            heapq.heapify(self._pending)
            # Wake waiters BEFORE archiving: `_move_to_history` may evict completion
            # events (with `max_history` of 0 it evicts this job's own event), and a
            # waiter already blocked on that event would otherwise never be released.
            self._signal_job_completed(job_id)
            self._move_to_history(job)
            self.logger.info(f"Cancelled pending job {job_id[:8]}")
            return True

        if job.status == JobStatus.running:
            # The executing coroutine polls this set and performs the actual cancel
            self._cancel_requested.add(job_id)
            self.logger.info(f"Cancellation requested for running job {job_id[:8]}")
            return True

        return False  # Already completed/cancelled/failed

def reorder(
    self,
    job_id: str,  # Job to move
    new_priority: int  # New priority value
) -> bool:  # True if reordered
    """Assign a new priority to a job that is still waiting in the queue."""
    target = self._jobs.get(job_id)
    if not target:
        return False
    if target.status != JobStatus.pending:
        return False

    target.priority = new_priority
    # The sort key changed in place, so the heap invariant must be re-established
    heapq.heapify(self._pending)
    self.logger.info(f"Reordered job {job_id[:8]} to priority {new_priority}")
    return True

# Attach the module-level functions to JobQueue as methods
JobQueue.cancel = cancel
JobQueue.reorder = reorder

### Observation

In [None]:
#| export
def get_job(
    self,
    job_id: str  # Job to retrieve
) -> Optional[Job]:  # Job or None
    """Look up a job by ID, returning None when the ID is not known."""
    jobs_by_id = self._jobs
    return jobs_by_id.get(job_id)

async def wait_for_job(
    self,
    job_id: str,  # Job to wait for
    timeout: Optional[float] = None  # Max seconds to wait
) -> Job:  # Completed/failed/cancelled job
    """Wait until the job reaches a terminal state, then return it.

    Raises `TimeoutError` if `timeout` elapses first, and `ValueError` when the
    ID has no completion event and does not belong to a finished job.
    """
    done_event = self._job_completed_events.get(job_id)
    if done_event:
        try:
            await asyncio.wait_for(done_event.wait(), timeout=timeout)
        except asyncio.TimeoutError:
            raise TimeoutError(f"Timeout waiting for job {job_id}")
        return self._jobs[job_id]

    # No completion event is registered; the job can still be returned if it is
    # known and already in a terminal state
    known = self._jobs.get(job_id)
    if known and known.status in (JobStatus.completed, JobStatus.failed, JobStatus.cancelled):
        return known
    raise ValueError(f"Unknown job: {job_id}")

def get_state(self) -> Dict[str, Any]:  # Queue state for UI
    """Build a plain-dict snapshot of the running job, pending jobs, history and counts."""
    active = self._running
    if active:
        running_view = {
            "id": active.id,
            "plugin_name": active.plugin_name,
            "status": active.status.value,
            "started_at": active.started_at,
            "progress": active.progress,
            "status_message": active.status_message
        }
    else:
        running_view = None

    # Sorting a copy yields execution order without disturbing the heap itself
    pending_view = []
    for position, queued in enumerate(sorted(self._pending)):
        pending_view.append({
            "id": queued.id,
            "plugin_name": queued.plugin_name,
            "priority": queued.priority,
            "position": position
        })

    # Most recent first, limited to the last 20 entries for the UI
    recent = self._history[-20:]
    history_view = [
        {
            "id": past.id,
            "plugin_name": past.plugin_name,
            "status": past.status.value,
            "completed_at": past.completed_at,
            "error": past.error
        }
        for past in recent[::-1]
    ]

    # Counts cover the whole retained history, not just the 20 entries shown
    stats = {
        "total_pending": len(self._pending),
        "total_completed": len([j for j in self._history if j.status == JobStatus.completed]),
        "total_failed": len([j for j in self._history if j.status == JobStatus.failed]),
        "total_cancelled": len([j for j in self._history if j.status == JobStatus.cancelled])
    }

    return {
        "running": running_view,
        "pending": pending_view,
        "history": history_view,
        "stats": stats
    }

def get_job_logs(
    self,
    job_id: str,  # Job to get logs for
    lines: int = 100  # Max lines to return
) -> str:  # Log content
    """Return the plugin manager's logs for the job's plugin, or "" for an unknown job."""
    job = self._jobs.get(job_id)
    if job:
        # Logs are looked up by plugin name, not by job ID
        return self.manager.get_plugin_logs(job.plugin_name, lines=lines)
    return ""

# Attach the module-level functions to JobQueue as methods
JobQueue.get_job = get_job
JobQueue.wait_for_job = wait_for_job
JobQueue.get_state = get_state
JobQueue.get_job_logs = get_job_logs

### Lifecycle

In [None]:
#| export
async def start(self) -> None:
    """Launch the background processor task; does nothing if already started."""
    if not self._running_flag:
        self._running_flag = True
        # The loop runs as its own task so this call returns immediately
        self._processor_task = asyncio.create_task(self._process_loop())
        self.logger.info("Job queue started")

async def stop(self) -> None:
    """Ask the processor loop to exit and wait briefly for it to finish."""
    # Clear the flag first, then set the event so a loop blocked waiting for
    # jobs wakes up and notices the flag
    self._running_flag = False
    self._job_available.set()

    grace_seconds = 5.0
    if self._processor_task:
        try:
            await asyncio.wait_for(self._processor_task, timeout=grace_seconds)
        except asyncio.TimeoutError:
            # The loop did not exit within the grace period
            self._processor_task.cancel()
        self._processor_task = None

    self.logger.info("Job queue stopped")

# Attach the module-level functions to JobQueue as methods
JobQueue.start = start
JobQueue.stop = stop

### Internal Methods

In [None]:
#| export
def _move_to_history(self, job: Job) -> None:
    """Append a job to the history and drop the oldest entry once over `max_history`."""
    history = self._history
    history.append(job)
    if len(history) <= self.max_history:
        return

    evicted = history.pop(0)
    # The evicted job's completion event is no longer needed
    self._job_completed_events.pop(evicted.id, None)

def _signal_job_completed(self, job_id: str) -> None:
    """Set the job's completion event; does nothing if no event is registered."""
    waiter_event = self._job_completed_events.get(job_id)
    if not waiter_event:
        return
    waiter_event.set()

async def _process_loop(self) -> None:
    """Pop pending jobs in heap order and execute them one at a time until stopped."""
    while self._running_flag:
        await self._job_available.wait()

        # The event is also set on shutdown, so re-check the flag after waking
        if not self._running_flag:
            break

        async with self._lock:
            have_job = bool(self._pending)
            if have_job:
                job = heapq.heappop(self._pending)
            # Nothing left to run: make the next iteration block until a submit
            if not self._pending:
                self._job_available.clear()

        if have_job:
            # Executed outside the lock so other queue operations are not blocked
            await self._execute_job(job)

async def _execute_job(self, job: Job) -> None:
    """Run one job to a terminal state, then signal waiters and archive it.

    The job ends up `completed`, `failed` or `cancelled`. Errors raised while
    running the plugin are recorded on the job rather than propagated.
    """
    self.logger.info(f"Starting job {job.id[:8]} ({job.plugin_name})")

    # Mark as running
    job.status = JobStatus.running
    job.started_at = time.time()
    self._running = job

    try:
        # Get the plugin proxy
        plugin = self.manager.get_plugin(job.plugin_name)
        if not plugin:
            raise ValueError(f"Plugin not loaded: {job.plugin_name}")

        # Poll progress concurrently with the execution
        progress_task = asyncio.create_task(
            self._poll_progress(job, plugin)
        )

        try:
            result = await self._execute_with_cancellation(job, plugin)

            if job.id in self._cancel_requested:
                # Execution returned after a cancel request: report it as
                # cancelled and do not store the result
                job.status = JobStatus.cancelled
            else:
                job.status = JobStatus.completed
                job.result = result

        except asyncio.CancelledError:
            # Raised on forced termination; also raised if the task running this
            # coroutine is itself cancelled. Either way the job is recorded as
            # cancelled and the error is not re-raised.
            job.status = JobStatus.cancelled
        except Exception as e:
            job.status = JobStatus.failed
            job.error = str(e)
            self.logger.error(f"Job {job.id[:8]} failed: {e}")
        finally:
            # Always stop the poller and wait for it to unwind
            progress_task.cancel()
            try:
                await progress_task
            except asyncio.CancelledError:
                pass

    except Exception as e:
        # Failures before execution started (e.g. plugin lookup)
        job.status = JobStatus.failed
        job.error = str(e)
        self.logger.error(f"Job {job.id[:8]} failed: {e}")

    # Finalize
    job.completed_at = time.time()
    if job.status == JobStatus.completed:
        job.progress = 1.0
    self._running = None
    # Drop the cancel request on every exit path (forced termination and failures
    # included) so finished job IDs do not accumulate in the set
    self._cancel_requested.discard(job.id)

    async with self._lock:
        # Signal before archiving: `_move_to_history` may evict completion events
        # (with `max_history` of 0 it evicts this job's own event), which would
        # leave a waiter already blocked on that event waiting forever.
        self._signal_job_completed(job.id)
        self._move_to_history(job)

    self.logger.info(f"Job {job.id[:8]} {job.status.value}")

async def _execute_with_cancellation(
    self,
    job: Job,
    plugin: Any
) -> Any:
    """Run the plugin for `job` while watching for a cancellation request.

    Returns the plugin's result. If cancellation is requested, the plugin's
    `cancel_async`/`cancel` hook (when present) is called and the execution is
    given `cancel_timeout` seconds to finish; after that the execution task is
    cancelled, the plugin is reloaded, and `asyncio.CancelledError` is raised.
    Exceptions raised by the execution itself propagate to the caller.
    """
    exec_task = asyncio.create_task(
        self.manager.execute_plugin_async(job.plugin_name, *job.args, **job.kwargs)
    )

    try:
        # Poll so a cancel request recorded by `cancel()` is noticed promptly
        while not exec_task.done():
            if job.id in self._cancel_requested:
                self.logger.info(f"Attempting cooperative cancel for {job.id[:8]}")

                try:
                    if hasattr(plugin, 'cancel_async'):
                        await plugin.cancel_async()
                    elif hasattr(plugin, 'cancel'):
                        plugin.cancel()
                except Exception as e:
                    # A failing cancel hook must not abort the cancellation:
                    # fall through to the timeout / force-termination path
                    self.logger.warning(
                        f"Cooperative cancel failed for job {job.id[:8]}: {e}"
                    )

                # Give the execution a bounded time to wind down on its own
                try:
                    return await asyncio.wait_for(exec_task, timeout=self.cancel_timeout)
                except asyncio.TimeoutError:
                    self.logger.warning(f"Force terminating plugin for job {job.id[:8]}")
                    exec_task.cancel()

                    # Reload the plugin to get a fresh worker
                    self.manager.reload_plugin(job.plugin_name)
                    raise asyncio.CancelledError()

            await asyncio.sleep(0.1)

        return exec_task.result()
    finally:
        # Never leave the execution running in the background when leaving early
        # (e.g. this coroutine was cancelled while polling)
        if not exec_task.done():
            exec_task.cancel()

async def _poll_progress(
    self,
    job: Job,
    plugin: Any
) -> None:
    """Copy the plugin's reported progress onto the job at a fixed interval.

    Runs until cancelled, or returns at once if the plugin exposes neither
    `get_progress_async` nor `get_progress`.
    """
    while True:
        try:
            if hasattr(plugin, 'get_progress_async'):
                snapshot = await plugin.get_progress_async()
            else:
                if not hasattr(plugin, 'get_progress'):
                    break  # Plugin does not report progress
                snapshot = plugin.get_progress()

            job.progress = snapshot.get('progress', 0.0)
            job.status_message = snapshot.get('message', '')

        except Exception:
            # Best effort: a failed poll is skipped and retried next interval
            pass

        await asyncio.sleep(self.progress_poll_interval)

# Attach the module-level functions to JobQueue as private methods
JobQueue._move_to_history = _move_to_history
JobQueue._signal_job_completed = _signal_job_completed
JobQueue._process_loop = _process_loop
JobQueue._execute_job = _execute_job
JobQueue._execute_with_cancellation = _execute_with_cancellation
JobQueue._poll_progress = _poll_progress

## Usage Example

```python
from cjm_plugin_system.core.manager import PluginManager
from cjm_plugin_system.core.queue import JobQueue, JobStatus
from cjm_plugin_system.core.scheduling import QueueScheduler

# Setup
manager = PluginManager(scheduler=QueueScheduler())
manager.discover_manifests()
manager.load_plugin(manager.get_discovered_meta("sys-mon"))
manager.register_system_monitor("sys-mon")

# Create queue
queue = JobQueue(manager)
await queue.start()

# Submit jobs
job1_id = await queue.submit("whisper", audio="/path/to/audio1.mp3")
job2_id = await queue.submit("gemini-vision", image="/path/to/image.png")
job3_id = await queue.submit("whisper", audio="/path/to/audio2.mp3", priority=10)

# Monitor queue state
state = queue.get_state()
print(f"Running: {state['running']}")
print(f"Pending: {len(state['pending'])} jobs")

# Cancel a job
await queue.cancel(job2_id)

# Wait for a job to complete
job1 = await queue.wait_for_job(job1_id)
if job1.status == JobStatus.completed:
    print(job1.result)

# Cleanup
await queue.stop()
manager.unload_all()
```

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()