In [7]:
import xgboost as xgb
import pandas as pd
from datetime import datetime

df = pd.read_csv('./cleaned.csv')

model = xgb.XGBRanker()
model.load_model('./ranker.json')

In [11]:
data = {
    "user_id": "user_1",
    "events": [
        {
            "ts": "2025-03-05 13:20:17+00:00",
            "endpoint": "GET /invoices",
            "sesson_id": "76dbd4b8-18f3-4e6b-bf1d-94b2412a4e33",
            "params": {'board_id'}
        },
        {
            "ts": "2025-03-05 13:22:01+00:00",
            "endpoint": "PUT /invoices/123/status",
            "sesson_id": "76dbd4b8-18f3-4e6b-bf1d-94b2412a4e33",
            "params": {
                "status": "DRAFT"
            }
        }
    ],
    "prompt": "Let's finish billing for Q2",
    "spec_url": "https://raw.githubusercontent.com/damoonsh/OS-Next-Action/refs/heads/main/specs/ops.yaml",
    "k": 5
}

In [None]:
def build_endpoint_map(doc_str):
    endpoint_map = {}

    for line in doc_str.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('-'):
            parts = line[2:].split(' ', 2)
            if len(parts) < 2:
                continue
            method, path = parts[0], parts[1]

            if method not in endpoint_map:
                endpoint_map[method] = []
            endpoint_map[method].append(path)

    return endpoint_map

def match_endpoint(request_str, endpoint_map):
    parts = request_str.strip().split(' ', 1)
    if len(parts) < 2:
        return None

    method, request_path = parts[0], parts[1]
    candidates = endpoint_map.get(method, [])
    request_segments = request_path.strip('/').split('/')

    for template in candidates:
        template_segments = template.strip('/').split('/')
        if len(request_segments) != len(template_segments):
            continue
        match = True
        for seg, temp_seg in zip(request_segments, template_segments):
            if temp_seg.startswith('{') and temp_seg.endswith('}'):
                continue
            if seg != temp_seg:
                match = False
                break
        if match:
            return template 

    return None

doc = """# Invoice Management  
- GET /invoices/ - Retrieve all invoices with optional filtering parameters  
- POST /invoices/ - Create a new invoice with customer and line item details  
- GET /invoices/{invoice_id} - Retrieve a specific invoice by ID  
- PUT /invoices/{invoice_id} - Update an existing invoice  
- PATCH /invoices/{invoice_id} - Partially update invoice fields  
- DELETE /invoices/{invoice_id} - Delete an invoice  
- GET /invoices/{invoice_id}/status - Get the current status of a specific invoice  
- PUT /invoices/{invoice_id}/status - Update the status of a specific invoice  

# Cost Management  
- GET /costs/{service_id} - Retrieve total costs for a specific service with date filtering  
- POST /costs/{service_id} - Add new cost entry for a service  
- DELETE /costs/{service_id} - Delete all cost entries for a service (requires confirmation)  
- PUT /costs/{service_id}/{cost_id} - Update an existing cost entry  
- PATCH /costs/{service_id}/{cost_id} - Partially update cost entry fields  
- DELETE /costs/{service_id}/{cost_id} - Remove a specific cost entry  

# Revenue Management  
- GET /revenue/{service_id} - Retrieve total revenue for a specific service with date filtering  
- POST /revenue/{service_id} - Record new revenue entry for a service  
- PUT /revenue/{service_id}/{revenue_id} - Update an existing revenue record  
- PATCH /revenue/{service_id}/{revenue_id} - Partially update revenue record  
- DELETE /revenue/{service_id}/{revenue_id} - Remove a specific revenue entry  

# Payment Processing  
- GET /payments/ - List all payments with status, method, and date-range filters  
- POST /payments/ - Process a new payment for an invoice  
- GET /payments/{payment_id} - Retrieve details of a specific payment  
- PATCH /payments/{payment_id} - Partially update payment details  
- DELETE /payments/{payment_id} - Cancel/delete a payment  

# Account Management  
- GET /accounts/ - Retrieve all financial accounts  
- GET /accounts/{account_id} - Get specific account details and current balance  
- PATCH /accounts/{account_id} - Update account information  
- DELETE /accounts/{account_id} - Close/delete an account  

# Budget Planning and Tracking  
- GET /budgets/ - List all budgets with category and period filters  
- POST /budgets/ - Create a new budget  
- GET /budgets/{budget_id} - Retrieve specific budget details  
- PUT /budgets/{budget_id} - Update existing budget allocations  
- PATCH /budgets/{budget_id} - Partially update budget  
- DELETE /budgets/{budget_id} - Delete a budget 
"""

In [16]:
endpoint_map = build_endpoint_map(doc)
for idx in range(len(data['events'])):
    method = data['events'][idx]['endpoint'].split(" ")[0]
    data['events'][idx]['endpoint_abstract'] = f"{method} {match_endpoint(data['events'][idx]['endpoint'], endpoint_map)}"

In [17]:
data

{'user_id': 'user_1',
 'events': [{'ts': '2025-03-05 13:20:17+00:00',
   'endpoint': 'GET /invoices',
   'sesson_id': '76dbd4b8-18f3-4e6b-bf1d-94b2412a4e33',
   'params': {'board_id'},
   'endpoint_abstract': 'GET /invoices/'},
  {'ts': '2025-03-05 13:22:01+00:00',
   'endpoint': 'PUT /invoices/123/status',
   'sesson_id': '76dbd4b8-18f3-4e6b-bf1d-94b2412a4e33',
   'params': {'status': 'DRAFT'},
   'endpoint_abstract': 'PUT /invoices/{invoice_id}/status'}],
 'prompt': "Let's finish billing for Q2",
 'spec_url': 'https://raw.githubusercontent.com/damoonsh/OS-Next-Action/refs/heads/main/specs/ops.yaml',
 'k': 5}

In [23]:
def create_inference_data_v2(data, min_occurrences=3):
    """
    Create inference data structure from events data without requiring df to be passed
    Uses global df and extracts previous actions from user's history
    
    Args:
        data: Input data with events containing user_id, events list, etc.
        min_occurrences: Minimum occurrences needed in df to use an action
    
    Returns:
        Dictionary suitable for model inference matching your training data format
    """
    
    # Access global df (assumed to exist in scope)
    global df
    
    user_id = data["user_id"]
    events = data["events"]
    
    # Sort events by timestamp (latest first)
    events_sorted = sorted(events, key=lambda x: x["ts"], reverse=True)
    
    print("Events sorted by timestamp (latest first):")
    for i, event in enumerate(events_sorted):
        print(f"{i+1}. {event['endpoint_abstract']} at {event['ts']}")
    
    # Find a suitable action that appears at least min_occurrences times in df
    selected_action = None
    selected_event = None
    
    for event in events_sorted:
        action = event["endpoint_abstract"]
        action_count = df[df['action'] == action].shape[0]
        print(f"\nChecking action '{action}': {action_count} occurrences in df")
        
        if action_count >= min_occurrences:
            selected_action = action
            selected_event = event
            print(f"✓ Selected action: {action} (has {action_count} occurrences)")
            break
        else:
            print(f"✗ Skipping action: {action} (only {action_count} occurrences, need {min_occurrences})")
    
    if not selected_action:
        print(f"\nWarning: No action found with {min_occurrences}+ occurrences. Using the latest action.")
        selected_event = events_sorted[0]
        selected_action = selected_event["endpoint_abstract"]
    
    # Get user's historical actions to build previous action context
    user_history = df[df['user_id'] == user_id].copy()
    if not user_history.empty:
        # Sort by timestamp to get chronological order
        user_history['timestamp'] = pd.to_datetime(user_history['timestamp'])
        user_history = user_history.sort_values('timestamp')
        print(f"\nFound {len(user_history)} historical actions for user {user_id}")
    else:
        print(f"\nNo historical actions found for user {user_id}")
        user_history = pd.DataFrame()
    
    # Build previous actions context
    if len(events) >= 2:
        print(f"\nEvents has {len(events)} actions. Building previous action context...")
        
        # Extract actions from events (excluding the selected/latest one)
        event_actions = [event["endpoint_abstract"] for event in reversed(events)]  # Chronological order
        print(f"Event actions in chronological order: {event_actions}")
        
        # Find prev_action_3 from user's history if needed
        if len(event_actions) >= 2:
            # We have at least 2 actions from events
            prev_action_1 = event_actions[-2]  # Second to last action
            
            if len(event_actions) >= 3:
                # We have 3+ actions from events
                prev_action_2 = event_actions[-3]
                prev_action_3 = event_actions[-4] if len(event_actions) >= 4 else ""
            else:
                # Only 2 actions in events, need to find prev_action_2 and prev_action_3 from history
                prev_action_2 = ""
                prev_action_3 = ""
                
                # Look for the most recent action from user's history that's not in current events
                event_action_set = set(event_actions)
                
                if not user_history.empty:
                    # Find actions from history that are not in current events
                    historical_actions = user_history[~user_history['action'].isin(event_action_set)]
                    
                    if not historical_actions.empty:
                        # Get the most recent historical action
                        most_recent_historical = historical_actions.iloc[-1]
                        prev_action_3 = most_recent_historical['action']
                        print(f"Found prev_action_3 from user history: {prev_action_3}")
                    else:
                        print("No suitable prev_action_3 found in user history")
                        prev_action_3 = ""
        else:
            # Only 1 action in events
            prev_action_1 = ""
            prev_action_2 = ""
            prev_action_3 = ""
            
            # Try to get previous actions from user's history
            if not user_history.empty:
                historical_actions = user_history['action'].tolist()
                if len(historical_actions) >= 1:
                    prev_action_1 = historical_actions[-1]
                if len(historical_actions) >= 2:
                    prev_action_2 = historical_actions[-2]
                if len(historical_actions) >= 3:
                    prev_action_3 = historical_actions[-3]
        
        prev_actions = [prev_action_1, prev_action_2, prev_action_3]
        print(f"Built previous actions: {prev_actions}")
        
    else:
        print(f"\nEvents has only {len(events)} action(s). Using historical data if available.")
        # Use user's historical data to build context
        if not user_history.empty and len(user_history) >= 3:
            historical_actions = user_history['action'].tolist()
            prev_actions = [
                historical_actions[-1] if len(historical_actions) >= 1 else "",
                historical_actions[-2] if len(historical_actions) >= 2 else "",
                historical_actions[-3] if len(historical_actions) >= 3 else ""
            ]
        else:
            prev_actions = ["", "", ""]
    
    # Calculate time difference
    latest_event_time = datetime.fromisoformat(events_sorted[0]["ts"].replace('+00:00', '+00:00'))
    reference_time = datetime.fromisoformat(selected_event["ts"].replace('+00:00', '+00:00'))
    seconds_passed = int((latest_event_time - reference_time).total_seconds())
    
    # Extract parameters from the selected event
    params_list = []
    if 'params' in selected_event and selected_event['params']:
        if isinstance(selected_event['params'], dict):
            params_list = list(selected_event['params'].keys())
        elif isinstance(selected_event['params'], set):
            params_list = list(selected_event['params'])
        else:
            params_list = list(selected_event['params'])
    
    # Pad parameters to ensure we have at least 3
    while len(params_list) < 3:
        params_list.append('')
    
    # Get pa values based on whether previous actions exist
    pa_values = [
        1.0 if prev_actions[0] else 0.0,
        1.0 if prev_actions[1] else 0.0,
        1.0 if prev_actions[2] else 0.0
    ]
    
    # Get time components from the selected event
    event_dt = datetime.fromisoformat(selected_event["ts"].replace('+00:00', '+00:00'))
    
    # Create the inference data structure
    X_inference = {
        'seconds_passed': seconds_passed,
        'pa1_ss': pa_values[0],
        'pa2_ss': pa_values[1],
        'pa3_ss': pa_values[2],
        'day': event_dt.day,
        'month': event_dt.month,
        'week': event_dt.isocalendar()[1],
        'year': event_dt.year,
        'prev_action_1': prev_actions[0],
        'prev_action_2': prev_actions[1],
        'prev_action_3': prev_actions[2],
        'para1': params_list[0],
        'para2': params_list[1],
        'para3': params_list[2]
    }
    
    return X_inference

# Complete prediction pipeline
def predict_next_actions_from_events_v2(data, model, features, cat_cols, all_actions, k=5):
    """
    Complete pipeline: events data -> inference format -> model predictions
    Does not require df to be passed - uses global df
    """
    # Create inference data structure
    X_inference = create_inference_data_v2(data, min_occurrences=3)
    
    # Use existing prediction logic
    candidates = []
    for action in all_actions:
        candidate = X_inference.copy()
        candidate['candidate_action'] = action
        candidates.append(candidate)
    
    # Convert to DataFrame and ensure proper categorical encoding
    candidates_df = pd.DataFrame(candidates)
    for col in cat_cols:
        if col in candidates_df.columns:
            candidates_df[col] = candidates_df[col].astype('category')
    
    # Predict with model
    X_test = candidates_df[features]
    scores = model.predict(X_test)
    
    # Rank actions by score
    action_scores = list(zip(all_actions, scores))
    ranked_actions = sorted(action_scores, key=lambda x: x[1], reverse=True)
    
    return ranked_actions[:k]


In [24]:
data

{'user_id': 'user_1',
 'events': [{'ts': '2025-03-05 13:20:17+00:00',
   'endpoint': 'GET /invoices',
   'sesson_id': '76dbd4b8-18f3-4e6b-bf1d-94b2412a4e33',
   'params': {'board_id'},
   'endpoint_abstract': 'GET /invoices/'},
  {'ts': '2025-03-05 13:22:01+00:00',
   'endpoint': 'PUT /invoices/123/status',
   'sesson_id': '76dbd4b8-18f3-4e6b-bf1d-94b2412a4e33',
   'params': {'status': 'DRAFT'},
   'endpoint_abstract': 'PUT /invoices/{invoice_id}/status'}],
 'prompt': "Let's finish billing for Q2",
 'spec_url': 'https://raw.githubusercontent.com/damoonsh/OS-Next-Action/refs/heads/main/specs/ops.yaml',
 'k': 5}

In [25]:
X = create_inference_data_v2(data, min_occurrences=3)

Events sorted by timestamp (latest first):
1. PUT /invoices/{invoice_id}/status at 2025-03-05 13:22:01+00:00
2. GET /invoices/ at 2025-03-05 13:20:17+00:00

Checking action 'PUT /invoices/{invoice_id}/status': 195 occurrences in df
✓ Selected action: PUT /invoices/{invoice_id}/status (has 195 occurrences)

Found 311 historical actions for user user_1

Events has 2 actions. Building previous action context...
Event actions in chronological order: ['PUT /invoices/{invoice_id}/status', 'GET /invoices/']
Found prev_action_3 from user history: POST /sprints
Built previous actions: ['PUT /invoices/{invoice_id}/status', '', 'POST /sprints']


In [26]:
X

{'seconds_passed': 0,
 'pa1_ss': 1.0,
 'pa2_ss': 0.0,
 'pa3_ss': 1.0,
 'day': 5,
 'month': 3,
 'week': 10,
 'year': 2025,
 'prev_action_1': 'PUT /invoices/{invoice_id}/status',
 'prev_action_2': '',
 'prev_action_3': 'POST /sprints',
 'para1': 'status',
 'para2': '',
 'para3': ''}

In [None]:
BASE_PROMPT = """
You are an intelligent API endpoint selector for a finance SaaS application. Your role is to analyze user interaction history and determine the most appropriate next endpoint to invoke based on the context, previous actions, and available API specifications.

## Instructions:

1. **Context Analysis**: Carefully examine the interaction history to understand:
   - What endpoints were previously called and with which parameters (shown in curly braces {{}})
   - The sequence of user actions and their outcomes
   - Any patterns or workflows the user is following
   - Current state of the application based on previous API calls

2. **Parameter Extraction**: Pay special attention to parameters passed in previous interactions:
   - Extract IDs, filters, and values from previous endpoint calls
   - Consider how these parameters influence the next logical step
   - Identify any missing parameters that might be needed

3. **Endpoint Selection Logic**: 
   - Choose endpoints that logically follow from the user's current workflow
   - Consider CRUD operation sequences (e.g., POST → GET → PATCH → DELETE)
   - Prioritize endpoints that complete user goals or provide necessary follow-up actions
   - Account for business logic constraints (e.g., can't delete invoices that aren't in 'draft' status)

4. **Response Format**: Provide your response as a JSON object with:
   - `selected_endpoint`: The HTTP method and path of the recommended endpoint
   - `reasoning`: Brief explanation of why this endpoint was chosen
   - `suggested_parameters`: Any parameters that should be included based on context
   - `confidence_level`: High/Medium/Low based on how certain you are about the selection

## Interaction History:

{history}

## Available API Specifications:

{api_specs}

## Additional Context:
{user_prompt_addition}
- Consider the current user's workflow state and business logic requirements
- If multiple endpoints seem equally valid, prioritize those that:
  1. Complete the current user task
  2. Provide essential follow-up information
  3. Enable the next logical step in the business process
- Account for any error conditions or validation requirements mentioned in the API specs
- Consider data dependencies between endpoints (e.g., needing invoice_id from previous GET /invoices/ call)

## Output Format:
Return a JSON array containing the most likely actions ordered by probability (most likely first). Each action should include the endpoint and reasoning:
{exclude_delete}
{{
  {{"action": "GET /invoices/123/", "reasoning": "User just created invoice 123 and likely wants to view the complete details"}},
  {{"action": "PATCH /invoices/123/status", "reasoning": "Natural next step would be to update the invoice status from draft to pending"}},
  {{"action": "GET /invoices/123/line-items/", "reasoning": "User might want to review or modify the line items of the newly created invoice"}}
}}

Analyze the interaction history and return your top 3 most likely next actions with their reasoning.
"""