# Retry Incidents

This notebook provides an interactive tool to search for and retry incidents in Operaton/Camunda 7.

## Features
- Filter incidents by process definition key, error message, topic name, or worker ID (the topic and worker filters are applied only while "External task incidents only" is checked)
- Toggle between external task incidents only and all incident types
- Retry selected incidents individually or all at once
- View incident details with links to Cockpit

## Incident Types
- **External Task Incidents** (`failedExternalTask`): Can be retried by setting retries > 0
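- **Job Incidents** (`failedJob`): Can be retried via the job retries API
- **Other incident types**: Listed in the results, but cannot be retried automatically by this notebook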

## Usage
1. Run all cells in order
2. Use filters to find specific incidents
3. Click "Search" (or press Enter in a filter text field) to apply the filters
4. Click "Retry Selected" or "Retry All" to retry incidents

In [None]:
# Imports and Environment Setup
import json
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import operaton
from operaton import Operaton

# Load the environment before any REST call is made. The bare top-level
# `await` only works because notebook cells are executed in an async context;
# it is a syntax error in a plain Python script.
# NOTE(review): presumably load_env() configures the Operaton client
# (base URL / credentials) — confirm against the operaton package.
await operaton.load_env()

# Confirmation that the awaited call returned without raising.
print("✓ Environment loaded")

In [None]:
# Shared notebook state.
#   incidents    - result list of the most recent search
#   selected_ids - initialised empty here; not referenced by the other visible cells
state = dict(incidents=[], selected_ids=set())


def _filter_text(description, placeholder):
    """Create an initially empty, 300px-wide text box used as a search filter."""
    return widgets.Text(
        value='',
        placeholder=placeholder,
        description=description,
        style={'description_width': '80px'},
        layout=widgets.Layout(width='300px'),
    )


def _filter_checkbox(description):
    """Create a 250px-wide checkbox that starts out ticked."""
    return widgets.Checkbox(
        value=True,
        description=description,
        layout=widgets.Layout(width='250px'),
    )


def _action_button(description, button_style, icon, width):
    """Create an action button with the given caption, colour style, icon and width."""
    return widgets.Button(
        description=description,
        button_style=button_style,
        icon=icon,
        layout=widgets.Layout(width=width),
    )


# Filter inputs (free text)
w_process_key = _filter_text('Process:', 'Process definition key (partial)')
w_message = _filter_text('Message:', 'Error message (partial)')
w_topic = _filter_text('Topic:', 'External task topic name')
w_worker_id = _filter_text('Worker:', 'Worker ID (partial)')

# Filter inputs (toggles)
w_external_only = _filter_checkbox('External task incidents only')
w_root_cause_only = _filter_checkbox('Root cause incidents only')

# Buttons that trigger the search and the retry actions
w_search_btn = _action_button('Search', 'primary', 'search', '120px')
w_retry_selected_btn = _action_button('Retry Selected', 'warning', 'refresh', '150px')
w_retry_all_btn = _action_button('Retry All', 'danger', 'refresh', '120px')

# Output areas: one for the incident table, one for retry feedback
out_incidents = widgets.Output()
out_result = widgets.Output()

# Multi-select list; its options are filled in by the search handler
w_incident_select = widgets.SelectMultiple(
    options=[],
    description='Select:',
    style={'description_width': '60px'},
    layout=widgets.Layout(width='100%', height='200px'),
)

In [None]:
# Event Handlers
def on_search_click(btn):
    """Search for open incidents matching the filter widgets and render them.

    Reads the filter widgets, queries ``/history/incident`` for open incidents,
    optionally narrows the result by root cause and by external task
    topic/worker, stores the result in ``state['incidents']``, fills
    ``w_incident_select`` and renders an HTML table into ``out_incidents``.
    The result area ``out_result`` is cleared as a side effect.

    Args:
        btn: The widget that triggered the call, or ``None`` when invoked
            programmatically. It is not used.
    """
    # Function-scope imports keep this cell independent of the import cell.
    from html import escape
    from urllib.parse import quote

    def _text(value):
        """Return *value* as a string, mapping None (a JSON null) to ''."""
        return '' if value is None else str(value)

    with out_result:
        clear_output()
    with out_incidents:
        clear_output(wait=True)

        process_filter = w_process_key.value.strip()
        message_filter = w_message.value.strip()
        topic_filter = w_topic.value.strip().lower()
        worker_filter = w_worker_id.value.strip().lower()

        # Build query parameters
        params = ['sortBy=createTime', 'sortOrder=desc', 'open=true']

        # Incident type filter
        if w_external_only.value:
            params.append('incidentType=failedExternalTask')

        # Process key filter. User input is percent-encoded so that the LIKE
        # wildcard '%' and characters such as '&' or spaces cannot corrupt
        # the query string.
        if process_filter:
            pattern = quote(f"%{process_filter}%", safe='')
            definitions = Operaton.get(f"/process-definition?keyLike={pattern}")
            # One definition is returned per deployed version; keep each key once.
            keys = list(dict.fromkeys(d['key'] for d in definitions))
            if not keys:
                print("No matching process definitions found.")
                state['incidents'] = []
                w_incident_select.options = []
                return
            encoded_keys = ','.join(quote(str(key), safe='') for key in keys)
            params.append(f"processDefinitionKeyIn={encoded_keys}")

        # Message filter
        if message_filter:
            params.append("incidentMessageLike=" + quote(f"%{message_filter}%", safe=''))

        incidents = Operaton.get('/history/incident?' + '&'.join(params))

        # Keep only incidents that are their own root cause
        if w_root_cause_only.value:
            incidents = [i for i in incidents if i['id'] == i.get('rootCauseIncidentId', i['id'])]

        # Topic/worker filtering needs the external task log of every incident;
        # it is only applied in "external task incidents only" mode.
        if w_external_only.value and (topic_filter or worker_filter):
            filtered_incidents = []
            lookup_failures = 0
            for inc in incidents:
                ext_task_id = inc.get('configuration')
                if not ext_task_id:
                    continue
                try:
                    # Ask for the most recent log entry. Without an explicit
                    # sort the single returned entry is arbitrary and may be
                    # the creation log, which carries no worker id.
                    logs = Operaton.get(
                        '/history/external-task-log'
                        f"?externalTaskId={quote(str(ext_task_id), safe='')}"
                        '&sortBy=timestamp&sortOrder=desc&maxResults=1'
                    )
                except Exception:
                    # Without details the filters cannot be evaluated, so the
                    # incident is left out, but the user is told about it below.
                    lookup_failures += 1
                    continue
                if not logs:
                    continue

                # Either field may be null in the log entry.
                topic = _text(logs[0].get('topicName'))
                worker = _text(logs[0].get('workerId'))

                if topic_filter and topic_filter not in topic.lower():
                    continue
                if worker_filter and worker_filter not in worker.lower():
                    continue

                inc['_topicName'] = topic
                inc['_workerId'] = worker
                filtered_incidents.append(inc)

            if lookup_failures:
                print(f"⚠️ Could not load external task details for {lookup_failures} incidents; they were excluded.")
            incidents = filtered_incidents

        state['incidents'] = incidents

        if not incidents:
            print("No incidents found matching the criteria.")
            w_incident_select.options = []
            return

        # Build the selection options and the table rows in a single pass
        cell = 'padding: 4px; border: 1px solid #ddd;'
        column_titles = ('#', 'Time', 'Type', 'Process', 'Topic', 'Worker', 'Message', 'Link')
        parts = [
            f"<h4>Found {len(incidents)} incidents</h4>",
            '<table style="border-collapse: collapse; width: 100%; font-size: 12px;">',
            '<tr style="background: #f0f0f0;">',
            *(f'<th style="{cell}">{title}</th>' for title in column_titles),
            '</tr>',
        ]
        options = []
        for number, inc in enumerate(incidents, start=1):
            # Any of these fields may be missing or null in the REST response.
            message = _text(inc.get('incidentMessage'))
            inc_type = _text(inc.get('incidentType')) or 'N/A'
            proc_key = _text(inc.get('processDefinitionKey')) or 'N/A'
            create_time = _text(inc.get('createTime'))[:19] or 'N/A'
            topic = _text(inc.get('_topicName')) or '-'
            worker = _text(inc.get('_workerId')) or '-'
            pi_id = _text(inc.get('processInstanceId'))

            # Option labels are plain text, so they are not HTML-escaped.
            short_message = message[:60] + ('...' if len(message) > 60 else '')
            options.append((f"{number}. [{inc_type}] {proc_key} - {short_message}", inc['id']))

            # Everything from the engine is escaped: incident messages often
            # contain '<', '>' or '&' (stack traces, XML) and would otherwise
            # break the table markup.
            plain_cells = (number, create_time, inc_type.replace('failed', ''), proc_key, topic, worker)
            link_id = escape(quote(pi_id, safe=''))
            parts.append('<tr>')
            parts.extend(f'<td style="{cell}">{escape(str(value))}</td>' for value in plain_cells)
            parts.append(
                f'<td style="{cell} max-width: 300px; overflow: hidden;">{escape(message[:80])}</td>'
            )
            parts.append(
                f'<td style="{cell}">'
                f'<a target="_blank" href="/camunda/app/cockpit/default/#/process-instance/{link_id}/runtime?tab=incidents-tab">Open</a>'
                '</td>'
            )
            parts.append('</tr>')
        parts.append('</table>')

        w_incident_select.options = options
        display(HTML('\n'.join(parts)))

def on_retry_selected_click(btn):
    """Retry the incidents currently highlighted in the selection list.

    Shows a warning in the result area when nothing is selected; otherwise
    hands the matching entries of ``state['incidents']`` to ``retry_incidents``.
    The ``btn`` argument is supplied by the button callback and is not used.
    """
    chosen_ids = list(w_incident_select.value)

    if chosen_ids:
        # Map the selected ids back to the full incident records of the last search
        matching = [inc for inc in state['incidents'] if inc['id'] in chosen_ids]
        retry_incidents(matching, "selected")
        return

    with out_result:
        clear_output(wait=True)
        print("⚠️ No incidents selected. Use the selection list to choose incidents.")

def on_retry_all_click(btn):
    """Retry every incident returned by the most recent search.

    Shows a warning in the result area when the last search produced nothing.
    The ``btn`` argument is supplied by the button callback and is not used.
    """
    found = state['incidents']

    if found:
        retry_incidents(found, "all")
        return

    with out_result:
        clear_output(wait=True)
        print("⚠️ No incidents to retry. Search for incidents first.")

def retry_incidents(incidents, label, retries=1):
    """Retry a list of incidents and report the outcome in the result area.

    External task incidents are retried with one batch call to
    ``/external-task/retries``; job incidents with one call per job to
    ``/job/{id}/retries``. Incidents of any other type are only listed.
    When at least one retry succeeded, the incident list is refreshed and the
    report is shown again afterwards.

    Args:
        incidents: Incident dicts as stored in ``state['incidents']``. The
            ``incidentType`` and ``configuration`` keys are read; the latter
            is used as the external task id or job id.
        label: Word used in the progress message (e.g. "selected" or "all").
        retries: Number of retries to set on each external task / job.
            Defaults to 1, the value this notebook has always used.
    """
    report = []

    def say(line=""):
        """Print *line* immediately and remember it for re-display after the refresh."""
        print(line)
        report.append(line)

    with out_result:
        clear_output(wait=True)

        # Separate by incident type
        external_task_incidents = [i for i in incidents if i.get('incidentType') == 'failedExternalTask']
        job_incidents = [i for i in incidents if i.get('incidentType') == 'failedJob']
        other_incidents = [i for i in incidents if i.get('incidentType') not in ('failedExternalTask', 'failedJob')]

        say(f"Retrying {len(incidents)} {label} incidents...")
        say(f"  - External task incidents: {len(external_task_incidents)}")
        say(f"  - Job incidents: {len(job_incidents)}")
        say(f"  - Other incidents: {len(other_incidents)}")
        say()

        # The 'configuration' field holds the id of the failed external task / job.
        ext_task_ids = [i['configuration'] for i in external_task_incidents if i.get('configuration')]
        job_ids = [i['configuration'] for i in job_incidents if i.get('configuration')]
        skipped_count = (
            len(external_task_incidents) - len(ext_task_ids)
            + len(job_incidents) - len(job_ids)
        )

        success_count = 0
        error_count = 0

        # Retry external task incidents (batch API)
        if ext_task_ids:
            try:
                Operaton.put(
                    "/external-task/retries",
                    json={"retries": retries, "externalTaskIds": ext_task_ids},
                )
            except Exception as e:
                say(f"❌ Failed to retry external tasks: {e}")
                error_count += len(ext_task_ids)
            else:
                say(f"✅ Set retries for {len(ext_task_ids)} external tasks")
                success_count += len(ext_task_ids)

        # Retry job incidents (individual API calls). Jobs are counted
        # separately so the message below does not include external tasks.
        job_success_count = 0
        for job_id in job_ids:
            try:
                Operaton.put(f"/job/{job_id}/retries", json={"retries": retries})
            except Exception as e:
                say(f"❌ Failed to retry job {job_id}: {e}")
                error_count += 1
            else:
                job_success_count += 1
        if job_success_count > 0:
            say(f"✅ Set retries for {job_success_count} jobs")
        success_count += job_success_count

        # Incidents without a task/job reference cannot be retried at all
        if skipped_count:
            say(f"⚠️ {skipped_count} incidents have no external task or job reference and were skipped")

        # Other incident types may need different handling
        if other_incidents:
            say(f"⚠️ {len(other_incidents)} incidents of unsupported types cannot be auto-retried")
            for inc in other_incidents:
                say(f"   - {inc.get('incidentType')}: {inc.get('id')}")

        say()
        say(f"Summary: {success_count} succeeded, {error_count} failed")

        if success_count > 0:
            # Progress hint only; it is not part of the report shown afterwards.
            print()
            print("Refreshing incident list...")

    # Refresh outside the output context. on_search_click clears the result
    # area, so the report is written back once the refresh is done; otherwise
    # the summary would vanish right after being printed.
    if success_count > 0:
        on_search_click(None)
        with out_result:
            clear_output(wait=True)
            print('\n'.join(report))

# Wire each button to its click handler
for _button, _handler in (
    (w_search_btn, on_search_click),
    (w_retry_selected_btn, on_retry_selected_click),
    (w_retry_all_btn, on_retry_all_click),
):
    _button.on_click(_handler)

# Pressing Enter in any filter text field runs the search.
# NOTE(review): newer ipywidgets releases flag Text.on_submit as deprecated — confirm
# against the installed version before relying on it long-term.
for _field in (w_process_key, w_message, w_topic, w_worker_id):
    _field.on_submit(lambda _sender: on_search_click(None))

In [None]:
# Assemble the composite widgets first, then render every section in order
_filters_box = widgets.VBox([
    widgets.HBox([w_process_key, w_message]),
    widgets.HBox([w_topic, w_worker_id]),
    widgets.HBox([w_external_only, w_root_cause_only]),
])
_actions_box = widgets.HBox([w_search_btn, w_retry_selected_btn, w_retry_all_btn])

# (heading markup, content) pairs, top to bottom as they appear on the page
_sections = (
    ("<h3>Filters</h3>", _filters_box),
    ("<h3>Actions</h3>", _actions_box),
    ("<h3>Incidents</h3>", out_incidents),
    ("<h4>Select incidents to retry:</h4>", w_incident_select),
    ("<h3>Result</h3>", out_result),
)
for _heading, _content in _sections:
    display(HTML(_heading))
    display(_content)

# Populate the incident list right away using the default filter values
on_search_click(None)