In [40]:
import pandas as pd

df = pd.read_csv('./cleaned.csv')

df.loc[df.prev_action_1.isna(), 'prev_action_1'] = 'FirstAction'
df.loc[df.prev_action_2.isna(), 'prev_action_2'] = '2ndAction'
df.loc[df.prev_action_3.isna(), 'prev_action_3'] = '3rdAction'

df['parameters'] = df['parameters'].apply(lambda x: eval(x) if isinstance(x, str) else x)
df['para1'] = df['parameters'].apply(lambda x: x[0] if len(x) > 0 else 'NULL')
df['para2'] = df['parameters'].apply(lambda x: x[1] if len(x) > 1 else 'NULL')
df['para3'] = df['parameters'].apply(lambda x: x[2] if len(x) > 2 else 'NULL')

In [41]:
import xgboost as xgb
import pandas as pd
import numpy as np
from xgboost import XGBRanker

def create_ranking_dataset(df):
    """
    Transform sequential data into ranking format for XGBRanker
    Each context gets multiple candidate actions to rank
    """
    ranking_data = []
    group_sizes = []
    
    # Cast action and categorical columns as categorical
    df['action'] = df['action'].astype('category')
    cat_cols = ['prev_action_1', 'prev_action_2', 'prev_action_3', 'para1', 'para2', 'para3']
    for col in cat_cols:
        df[col] = df[col].astype('category')
    
    # Define context columns (features that define a unique context)
    context_cols = ['prev_action_1', 'prev_action_2', 'prev_action_3', 'para1', 'para2', 'para3']
    
    # Group by context to create ranking groups
    for context_id, group in df.groupby(context_cols, observed=False):
        # Get all possible actions in the dataset
        all_actions = df['action'].cat.categories.tolist()
        
        # Extract context features from the first row in this group
        context_row = group[['seconds_passed', 'pa1_ss', 'pa2_ss', 'pa3_ss', 
                           'day', 'month', 'week', 'year'] + context_cols].iloc[0]
        context_features = context_row.to_dict()
        
        # Create candidate-context pairs for each possible action
        group_size = 0
        for action in all_actions:
            # Create features for this context-action pair
            candidate_features = context_features.copy()
            candidate_features['candidate_action'] = action
            
            # Label: 1 if this action was actually taken in this context, 0 otherwise
            label = 1 if action in group['action'].values else 0
            candidate_features['label'] = label
            
            ranking_data.append(candidate_features)
            group_size += 1
        
        group_sizes.append(group_size)
    
    return pd.DataFrame(ranking_data), group_sizes

def prepare_features(ranking_df):
    """
    Prepare features for training
    """
    # Cast candidate_action as categorical
    ranking_df['candidate_action'] = ranking_df['candidate_action'].astype('category')
    
    # Define all feature columns
    features = [
        'seconds_passed', 'pa1_ss', 'pa2_ss', 'pa3_ss',
        'day', 'month', 'week', 'year',
        'prev_action_1', 'prev_action_2', 'prev_action_3',
        'para1', 'para2', 'para3', 'candidate_action'
    ]
    
    # Ensure all categorical features are properly encoded
    cat_cols = ['prev_action_1', 'prev_action_2', 'prev_action_3', 
                'para1', 'para2', 'para3', 'candidate_action']
    for col in cat_cols:
        if col in ranking_df.columns:
            ranking_df[col] = ranking_df[col].astype('category')
    
    return features, cat_cols

def train_xgb_ranker(X, y, group_sizes):
    """
    Train XGBRanker model
    """
    model = XGBRanker(
        objective='rank:ndcg',
        learning_rate=0.1,
        max_depth=6,
        n_estimators=100,
        enable_categorical=True,
        random_state=42
    )
    
    model.fit(X, y, group=group_sizes)
    return model

def predict_top_k_actions(model, context_features, all_actions, features_list, cat_cols, k=5):
    """
    Predict top-k ranked actions for a given context
    
    Args:
        model: Trained XGBRanker
        context_features: Dictionary with context information
        all_actions: List of all possible actions
        features_list: List of feature names used in training
        cat_cols: List of categorical column names
        k: Number of top actions to return
    
    Returns:
        List of tuples (action, score) sorted by score descending
    """
    candidates = []
    
    # Create candidate features for each possible action
    for action in all_actions:
        candidate = context_features.copy()
        candidate['candidate_action'] = action
        candidates.append(candidate)
    
    # Convert to DataFrame
    candidates_df = pd.DataFrame(candidates)
    
    # Ensure categorical columns match training data
    for col in cat_cols:
        if col in candidates_df.columns:
            candidates_df[col] = candidates_df[col].astype('category')
    
    # Predict scores
    X_test = candidates_df[features_list]
    scores = model.predict(X_test)
    
    # Rank actions by score
    action_scores = list(zip(all_actions, scores))
    ranked_actions = sorted(action_scores, key=lambda x: x[1], reverse=True)
    
    return ranked_actions[:k]

# Main execution pipeline
def main():
    # Assuming your DataFrame 'df' is already loaded with the required columns:
    # ['action', 'seconds_passed', 'pa1_ss', 'pa2_ss', 'pa3_ss', 
    #  'day', 'month', 'week', 'year', 'prev_action_1', 'prev_action_2', 
    #  'prev_action_3', 'para1', 'para2', 'para3']
    
    print("Step 1: Creating ranking dataset...")
    ranking_df, group_sizes = create_ranking_dataset(df)
    print(f"Ranking dataset shape: {ranking_df.shape}")
    print(f"Number of ranking groups: {len(group_sizes)}")
    
    print("\nStep 2: Preparing features...")
    features, cat_cols = prepare_features(ranking_df)
    
    X = ranking_df[features]
    y = ranking_df['label']
    print(f"Training data shape: X={X.shape}, y={y.shape}")
    print(f"Label distribution: {y.value_counts().to_dict()}")
    
    print("\nStep 3: Training XGBRanker...")
    model = train_xgb_ranker(X, y, group_sizes)
    print("Model training completed!")
    
    print("\nStep 4: Testing predictions...")
    # Example context for prediction
    test_context = {
        'seconds_passed': 120,
        'pa1_ss': 0.5,
        'pa2_ss': 0.3,
        'pa3_ss': 0.2,
        'day': 15,
        'month': 9,
        'week': 37,
        'year': 2025,
        'prev_action_1': 'login',
        'prev_action_2': 'browse',
        'prev_action_3': 'search',
        'para1': 'electronics',
        'para2': 'mobile',
        'para3': 'apple'
    }
    
    all_actions = df['action'].cat.categories.tolist()
    top_actions = predict_top_k_actions(model, test_context, all_actions, features, cat_cols, k=5)
    
    print("\nTop 5 predicted actions:")
    for i, (action, score) in enumerate(top_actions, 1):
        print(f"{i}. {action}: {score:.4f}")
    
    return model, features, cat_cols, all_actions

# Alternative: Using native XGBoost with DMatrix (if you prefer the original approach)
def train_with_dmatrix(ranking_df, group_sizes, features):
    """
    Alternative training method using XGBoost DMatrix
    """
    X = ranking_df[features]
    y = ranking_df['label']
    
    # Create DMatrix
    train_dmatrix = xgb.DMatrix(X, label=y, enable_categorical=True)
    train_dmatrix.set_group(group_sizes)
    
    # Training parameters
    params = {
        'objective': 'rank:ndcg',
        'eval_metric': 'ndcg@5',
        'learning_rate': 0.1,
        'max_depth': 6,
        'lambdarank_pair_method': 'topk',
        'lambdarank_num_pair_per_sample': 5,
        'random_state': 42
    }
    
    model = xgb.train(params, train_dmatrix, num_boost_round=100)
    return model

def predict_with_dmatrix(model, context_features, all_actions, features_list, cat_cols, k=5):
    """
    Prediction function for DMatrix-trained model
    """
    candidates = []
    for action in all_actions:
        candidate = context_features.copy()
        candidate['candidate_action'] = action
        candidates.append(candidate)
    
    candidates_df = pd.DataFrame(candidates)
    for col in cat_cols:
        if col in candidates_df.columns:
            candidates_df[col] = candidates_df[col].astype('category')
    
    X_test = xgb.DMatrix(candidates_df[features_list], enable_categorical=True)
    scores = model.predict(X_test)
    
    action_scores = list(zip(all_actions, scores))
    ranked_actions = sorted(action_scores, key=lambda x: x[1], reverse=True)
    
    return ranked_actions[:k]

In [None]:
model, features, cat_cols, all_actions = main()
    
# For new predictions:
new_context = {
        'seconds_passed': 90,
        'pa1_ss': 0.7,
        'pa2_ss': 0.4,
        'pa3_ss': 0.1,
        'day': 20,
        'month': 10,
        'week': 42,
        'year': 2025,
        'prev_action_1': 'browse',
        'prev_action_2': 'search',
        'prev_action_3': 'view',
        'para1': 'books',
        'para2': 'laptop',
        'para3': 'samsung'
    }
    
predictions = predict_top_k_actions(model, new_context, all_actions, features, cat_cols, k=3)
print("\nNew prediction example:")
for i, (action, score) in enumerate(predictions, 1):
    print(f"{i}. {action}: {score:.4f}")

Step 1: Creating ranking dataset...
Ranking dataset shape: (91392, 16)
Number of ranking groups: 3264

Step 2: Preparing features...
Training data shape: X=(91392, 15), y=(91392,)
Label distribution: {0: 88089, 1: 3303}

Step 3: Training XGBRanker...
Model training completed!

Step 4: Testing predictions...

Top 5 predicted actions:
1. POST /sprints/{sprintId}/tickets: 0.4466
2. PUT /budgets/{budget_id}: 0.1282
3. PUT /costs/{service_id}/{cost_id}: 0.0901
4. DELETE /tickets/{ticketId}: 0.0645
5. PUT /invoices/{invoice_id}: 0.0585

New prediction example:
1. POST /sprints/{sprintId}/tickets: 0.4466
2. PUT /budgets/{budget_id}: 0.1282
3. PUT /costs/{service_id}/{cost_id}: 0.0901
