# Restaurant Survival Prediction - GPU Accelerated with XGBoost

**üöÄ GPU-Accelerated Training**: Uses XGBoost Survival with CUDA support

**Prerequisites**: 
- Run `kaggle_feature_extraction_complete.ipynb` first
- Enable **GPU T4 x2** in Kaggle notebook settings

**Target**: C-index 0.85-0.90 in **<10 minutes** (vs hours with CPU)

**Strategy**:
1. Load pre-extracted features (130 features)
2. Quick feature importance with XGBoost (GPU-accelerated)
3. Test Top-K feature selections
4. Final optimized model

**Advantages over scikit-survival**:
- ‚úÖ 10-50x faster with GPU
- ‚úÖ Handles 130 features easily
- ‚úÖ Better memory efficiency
- ‚úÖ Native Cox regression support

---

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q xgboost scikit-survival

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import gc
import time
import warnings
warnings.filterwarnings('ignore')

# XGBoost for GPU acceleration
import xgboost as xgb
from xgboost import DMatrix

# Survival analysis metrics
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv

# Model selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("‚úÖ Imports complete")

# Check GPU availability
print("\nüîç GPU Check:")
try:
    print(f"   XGBoost version: {xgb.__version__}")
    print(f"   GPU available: {xgb.get_config()['use_rmm']}")
except:
    print("   XGBoost installed (GPU support will be auto-detected)")

## 2. Configuration

In [None]:
# Paths
DATA_PATH = Path('/kaggle/input') if Path('/kaggle/input').exists() else Path('data')
OUTPUT_PATH = Path('/kaggle/working') if Path('/kaggle').exists() else Path('outputs/survival_training_advanced')
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# XGBoost Survival Configuration (GPU-accelerated)
XGBOOST_CONFIG = {
    'objective': 'survival:cox',  # Cox proportional hazards
    'eval_metric': 'cox-nloglik',
    'tree_method': 'hist',        # Will auto-detect GPU
    'device': 'cuda',              # Force GPU usage
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 500,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 5,
    'gamma': 0.1,
    'reg_alpha': 0.1,              # L1 regularization
    'reg_lambda': 1.0,             # L2 regularization
    'random_state': 42,
    'n_jobs': -1
}

# Quick feature importance config (fewer trees)
QUICK_XGBOOST_CONFIG = XGBOOST_CONFIG.copy()
QUICK_XGBOOST_CONFIG['n_estimators'] = 100  # Faster for importance

# Train/test split
TEST_SIZE = 0.2
RANDOM_STATE = 42

print(f"üìÅ Data: {DATA_PATH}")
print(f"üìÅ Output: {OUTPUT_PATH}")
print(f"\nüéØ XGBoost Config:")
print(f"   Device: {XGBOOST_CONFIG['device']}")
print(f"   Tree method: {XGBOOST_CONFIG['tree_method']}")
print(f"   N-estimators: {XGBOOST_CONFIG['n_estimators']}")
print(f"   Learning rate: {XGBOOST_CONFIG['learning_rate']}")
print(f"   Max depth: {XGBOOST_CONFIG['max_depth']}")

## 3. Load Pre-Extracted Features

In [None]:
# Load the comprehensive feature set
features_file = DATA_PATH / 'jakarta_restaurant_features_complete.csv'

if not features_file.exists():
    raise FileNotFoundError(
        f"‚ùå Features file not found: {features_file}\n"
        "Please run 'kaggle_feature_extraction_complete.ipynb' first!"
    )

df = pd.read_csv(features_file)

print(f"‚úÖ Loaded {len(df):,} restaurants")
print(f"‚úÖ Total columns: {len(df.columns)}")
print(f"\nüìä Data overview:")
print(df.head())

## 4. Data Preparation

In [None]:
# Verify survival labels exist
required_cols = ['survival_days', 'event_observed', 'categorical_label']
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"‚ùå Missing required columns: {missing}")

# Filter mature restaurants only (already done in extraction, but verify)
df_mature = df[df['categorical_label'] != 2].copy()

print(f"‚úÖ Mature restaurants: {len(df_mature):,}")
print(f"   - Survived: {(df_mature['event_observed'] == 0).sum():,}")
print(f"   - Failed: {(df_mature['event_observed'] == 1).sum():,}")
print(f"   - Failure rate: {(df_mature['event_observed'] == 1).mean():.1%}")

In [None]:
# Identify feature columns (exclude metadata and labels)
exclude_cols = [
    'osm_id', 'name', 'poi_type', 'date_created', 'date_closed',
    'survival_days', 'event_observed', 'categorical_label',
    'geometry', 'lat', 'lon'
]

feature_cols = [c for c in df_mature.columns if c not in exclude_cols]

print(f"‚úÖ Total features available: {len(feature_cols)}")
print(f"\nüìã Feature columns:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i:2d}. {col}")

In [None]:
# Check for missing values
missing_counts = df_mature[feature_cols].isnull().sum()
missing_features = missing_counts[missing_counts > 0]

if len(missing_features) > 0:
    print(f"‚ö†Ô∏è  Features with missing values:")
    for col, count in missing_features.items():
        pct = count / len(df_mature) * 100
        print(f"  - {col}: {count:,} ({pct:.1f}%)")
    
    # Fill missing values with median
    df_mature[feature_cols] = df_mature[feature_cols].fillna(df_mature[feature_cols].median())
    print("\n‚úÖ Filled missing values with median")
else:
    print("‚úÖ No missing values found")

In [None]:
# Prepare data for XGBoost
# XGBoost survival needs lower and upper bounds for censored data
# For our case:
# - event=1 (died): lower_bound = upper_bound = survival_days
# - event=0 (censored): lower_bound = survival_days, upper_bound = +inf

y_lower = df_mature['survival_days'].values
y_upper = df_mature['survival_days'].copy()
y_upper[df_mature['event_observed'] == 0] = np.inf  # Censored

X = df_mature[feature_cols].values

print(f"‚úÖ Created XGBoost survival data")
print(f"   - X shape: {X.shape}")
print(f"   - Events: {(df_mature['event_observed'] == 1).sum():,}")
print(f"   - Censored: {(df_mature['event_observed'] == 0).sum():,}")

In [None]:
# Train/test split
X_train, X_test, y_lower_train, y_lower_test, y_upper_train, y_upper_test, event_train, event_test = train_test_split(
    X, y_lower, y_upper, df_mature['event_observed'].values,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df_mature['event_observed']
)

print(f"‚úÖ Train/test split complete")
print(f"   - Train: {len(X_train):,} samples")
print(f"   - Test: {len(X_test):,} samples")
print(f"   - Features: {X_train.shape[1]}")

## 5. Create XGBoost DMatrix (GPU-optimized format)

In [None]:
# Create DMatrix for XGBoost (GPU-optimized data structure)
print("Creating DMatrix for GPU training...")

dtrain = DMatrix(X_train, label_lower_bound=y_lower_train, label_upper_bound=y_upper_train, feature_names=feature_cols)
dtest = DMatrix(X_test, label_lower_bound=y_lower_test, label_upper_bound=y_upper_test, feature_names=feature_cols)

print("‚úÖ DMatrix created (GPU-ready)")
print(f"   - Train DMatrix: {dtrain.num_row():,} rows x {dtrain.num_col()} features")
print(f"   - Test DMatrix: {dtest.num_row():,} rows x {dtest.num_col()} features")

## 6. Quick Feature Importance (GPU-accelerated)

In [None]:
print("üî• Training XGBoost for feature importance (100 trees on GPU)...")
start_time = time.time()

# Train with fewer trees for quick importance
model_importance = xgb.train(
    QUICK_XGBOOST_CONFIG,
    dtrain,
    num_boost_round=QUICK_XGBOOST_CONFIG['n_estimators'],
    evals=[(dtrain, 'train'), (dtest, 'test')],
    verbose_eval=False
)

elapsed = time.time() - start_time

print(f"\n‚úÖ Training complete in {elapsed:.1f}s (GPU-accelerated!)")
print(f"   Speed: {QUICK_XGBOOST_CONFIG['n_estimators'] / elapsed:.1f} trees/second")

# Get feature importance
importance_dict = model_importance.get_score(importance_type='gain')  # Use 'gain' for importance
importance_df = pd.DataFrame({
    'feature': list(importance_dict.keys()),
    'importance': list(importance_dict.values())
}).sort_values('importance', ascending=False)

# Normalize to percentages
importance_df['importance_pct'] = importance_df['importance'] / importance_df['importance'].sum() * 100

print(f"\nüìä Top 20 Features by Importance:")
print("=" * 60)
for idx, row in importance_df.head(20).iterrows():
    print(f"{row['feature']:40s} {row['importance_pct']:6.2f}%")

# Save
importance_df.to_csv(OUTPUT_PATH / 'feature_importance_xgboost.csv', index=False)
print(f"\n‚úÖ Saved to: {OUTPUT_PATH / 'feature_importance_xgboost.csv'}")

In [None]:
print("üî• Training Gradient Boosting Survival Analysis with ALL features...")

gbs_all = GradientBoostingSurvivalAnalysis(**GBS_CONFIG)
gbs_all.fit(X_train_scaled, y_train)

# Predict on test set
pred_gbs = gbs_all.predict(X_test_scaled)

# Calculate C-index
c_index_gbs = concordance_index_censored(
    y_test['event'],  # Fixed
    y_test['time'],   # Fixed
    pred_gbs
)[0]

print(f"\n‚úÖ Gradient Boosting (All Features)")
print(f"   - C-index: {c_index_gbs:.4f}")
print(f"   - Features used: {len(feature_cols)}")

## 7. Feature Importance Analysis

In [None]:
# Get feature importance from RSF
importances = rsf_all.feature_importances_

# Create dataframe
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': importances
}).sort_values('importance', ascending=False)

# Normalize to percentages
importance_df['importance_pct'] = importance_df['importance'] / importance_df['importance'].sum() * 100

print("\nüìä Top 20 Features by Importance:")
print("=" * 60)
for idx, row in importance_df.head(20).iterrows():
    print(f"{row['feature']:40s} {row['importance_pct']:6.2f}%")

# Save to CSV
importance_df.to_csv(OUTPUT_PATH / 'feature_importance_all.csv', index=False)
print(f"\n‚úÖ Saved to: {OUTPUT_PATH / 'feature_importance_all.csv'}")

In [None]:
# Visualize top 20 features
plt.figure(figsize=(12, 8))
top_20 = importance_df.head(20)
plt.barh(range(len(top_20)), top_20['importance_pct'])
plt.yticks(range(len(top_20)), top_20['feature'])
plt.xlabel('Importance (%)')
plt.title('Top 20 Features - Random Survival Forest')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'feature_importance_top20.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ Saved plot to: {OUTPUT_PATH / 'feature_importance_top20.png'}")

## 8. Feature Selection - Top K Features

In [None]:
# Test different numbers of top features
top_k_values = [5, 10, 15, 20, 30, 40, 50, len(feature_cols)]

results_top_k = []

for k in top_k_values:
    print(f"\nüî• Testing with Top {k} features...")
    
    # Select top k features
    if k < len(feature_cols):
        top_features = importance_df.head(k)['feature'].tolist()
        feature_indices = [feature_cols.index(f) for f in top_features]
        
        X_train_k = X_train[:, feature_indices]
        X_test_k = X_test[:, feature_indices]
    else:
        X_train_k = X_train
        X_test_k = X_test
        top_features = feature_cols
    
    # Train RSF
    rsf_k = RandomSurvivalForest(**RSF_CONFIG)
    rsf_k.fit(X_train_k, y_train)
    
    # Predict and evaluate
    pred_k = rsf_k.predict(X_test_k)
    c_index_k = concordance_index_censored(
        y_test['event'],  # Fixed
        y_test['time'],   # Fixed
        pred_k
    )[0]
    
    results_top_k.append({
        'n_features': k,
        'c_index': c_index_k,
        'features': ', '.join(top_features[:5]) + ('...' if k > 5 else '')
    })
    
    print(f"   C-index: {c_index_k:.4f}")
    
    # Memory cleanup
    del rsf_k, pred_k
    gc.collect()

# Create results dataframe
df_top_k = pd.DataFrame(results_top_k)

print("\nüìä Top-K Feature Selection Results:")
print("=" * 60)
print(df_top_k.to_string(index=False))

# Save results
df_top_k.to_csv(OUTPUT_PATH / 'top_k_feature_results.csv', index=False)
print(f"\n‚úÖ Saved to: {OUTPUT_PATH / 'top_k_feature_results.csv'}")

In [None]:
# Visualize Top-K results
plt.figure(figsize=(10, 6))
plt.plot(df_top_k['n_features'], df_top_k['c_index'], marker='o', linewidth=2, markersize=8)
plt.xlabel('Number of Top Features')
plt.ylabel('C-index')
plt.title('Model Performance vs Number of Features')
plt.grid(True, alpha=0.3)
plt.axhline(y=baseline_c_index, color='r', linestyle='--', label=f'Baseline (all features): {baseline_c_index:.4f}')
plt.legend()
plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'top_k_performance.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ Saved plot to: {OUTPUT_PATH / 'top_k_performance.png'}")

## 9. Feature Group Analysis

In [None]:
# Define feature groups based on extraction sections
feature_groups = {
    'Shannon Entropy': [c for c in feature_cols if 'entropy' in c.lower()],
    'POI Counts': [c for c in feature_cols if '_count_' in c and c not in ['mosque_count_500m', 'mosque_count_1000m', 'pasar_count_1000m', 'convenience_count_1000m', 'spbu_count_2000m']],
    'POI Densities': [c for c in feature_cols if '_density' in c and c not in ['transport_density', 'competition_density', 'street_food_density']],
    'Indonesia Specific': ['mosque_count_500m', 'mosque_count_1000m', 'nearest_mosque_m', 'pasar_count_1000m', 'nearest_pasar_m', 'convenience_count_1000m', 'spbu_count_2000m', 'nearest_spbu_m', 'friday_prayer_multiplier', 'pasar_proximity_score', 'gas_proximity_score'],
    'Competition': [c for c in feature_cols if 'competitor' in c.lower() or 'competition' in c.lower() or 'cannibalization' in c.lower()],
    'Demographics': [c for c in feature_cols if 'income' in c or 'density_district' in c or 'working_age_district' in c],
    'Accessibility': [c for c in feature_cols if 'dist_city_center' in c or 'transport_density' in c or 'centrality' in c],
    'Interactions': [c for c in feature_cols if any(x in c for x in ['income_population', 'working_age_mall', 'office_transport', 'demand_supply', 'mosque_residential', 'pasar_transport'])],
    'Temporal': [c for c in feature_cols if any(x in c for x in ['ramadan', 'weekend', 'gajian', 'school_holiday'])]
}

# Verify all features are categorized
categorized = set()
for group, features in feature_groups.items():
    categorized.update(features)

uncategorized = set(feature_cols) - categorized
if uncategorized:
    feature_groups['Other'] = list(uncategorized)

print("üìä Feature Groups:")
print("=" * 60)
for group, features in feature_groups.items():
    print(f"{group:20s}: {len(features):2d} features")
    for f in features[:3]:
        print(f"  - {f}")
    if len(features) > 3:
        print(f"  ... and {len(features)-3} more")

In [None]:
# Test each feature group independently
results_groups = []

for group_name, group_features in feature_groups.items():
    if len(group_features) == 0:
        continue
    
    print(f"\nüî• Testing '{group_name}' group ({len(group_features)} features)...")
    
    # Get feature indices
    feature_indices = [feature_cols.index(f) for f in group_features if f in feature_cols]
    
    if len(feature_indices) == 0:
        print("   ‚ö†Ô∏è  No valid features found")
        continue
    
    X_train_g = X_train[:, feature_indices]
    X_test_g = X_test[:, feature_indices]
    
    # Train RSF
    rsf_g = RandomSurvivalForest(**RSF_CONFIG)
    rsf_g.fit(X_train_g, y_train)
    
    # Predict and evaluate
    pred_g = rsf_g.predict(X_test_g)
    c_index_g = concordance_index_censored(
        y_test['event'],  # Fixed
        y_test['time'],   # Fixed
        pred_g
    )[0]
    
    results_groups.append({
        'group': group_name,
        'n_features': len(feature_indices),
        'c_index': c_index_g
    })
    
    print(f"   C-index: {c_index_g:.4f}")
    
    # Memory cleanup
    del rsf_g, pred_g
    gc.collect()

# Create results dataframe
df_groups = pd.DataFrame(results_groups).sort_values('c_index', ascending=False)

print("\nüìä Feature Group Performance:")
print("=" * 60)
print(df_groups.to_string(index=False))

# Save results
df_groups.to_csv(OUTPUT_PATH / 'feature_group_results.csv', index=False)
print(f"\n‚úÖ Saved to: {OUTPUT_PATH / 'feature_group_results.csv'}")

In [None]:
# Visualize group performance
plt.figure(figsize=(10, 6))
plt.barh(range(len(df_groups)), df_groups['c_index'])
plt.yticks(range(len(df_groups)), df_groups['group'])
plt.xlabel('C-index')
plt.title('Feature Group Performance')
plt.axvline(x=baseline_c_index, color='r', linestyle='--', label=f'All features: {baseline_c_index:.4f}')
plt.legend()
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'feature_group_performance.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ Saved plot to: {OUTPUT_PATH / 'feature_group_performance.png'}")

## 10. Progressive Feature Addition

In [None]:
# Add feature groups progressively (by group performance)
sorted_groups = df_groups.sort_values('c_index', ascending=False)['group'].tolist()

results_progressive = []
cumulative_features = []

for i, group_name in enumerate(sorted_groups, 1):
    # Add this group's features
    group_features = feature_groups[group_name]
    cumulative_features.extend([f for f in group_features if f in feature_cols])
    
    print(f"\nüî• Progressive Test {i}: Adding '{group_name}' ({len(cumulative_features)} total features)...")
    
    # Get feature indices
    feature_indices = [feature_cols.index(f) for f in cumulative_features]
    
    X_train_p = X_train[:, feature_indices]
    X_test_p = X_test[:, feature_indices]
    
    # Train RSF
    rsf_p = RandomSurvivalForest(**RSF_CONFIG)
    rsf_p.fit(X_train_p, y_train)
    
    # Predict and evaluate
    pred_p = rsf_p.predict(X_test_p)
    c_index_p = concordance_index_censored(
        y_test['event'],  # Fixed
        y_test['time'],   # Fixed
        pred_p
    )[0]
    
    results_progressive.append({
        'step': i,
        'added_group': group_name,
        'total_features': len(cumulative_features),
        'c_index': c_index_p
    })
    
    print(f"   C-index: {c_index_p:.4f}")
    
    # Memory cleanup
    del rsf_p, pred_p
    gc.collect()

# Create results dataframe
df_progressive = pd.DataFrame(results_progressive)

print("\nüìä Progressive Feature Addition:")
print("=" * 80)
print(df_progressive.to_string(index=False))

# Save results
df_progressive.to_csv(OUTPUT_PATH / 'progressive_feature_results.csv', index=False)
print(f"\n‚úÖ Saved to: {OUTPUT_PATH / 'progressive_feature_results.csv'}")

In [None]:
# Visualize progressive addition
plt.figure(figsize=(12, 6))
plt.plot(df_progressive['step'], df_progressive['c_index'], marker='o', linewidth=2, markersize=8)
plt.xlabel('Progressive Step')
plt.ylabel('C-index')
plt.title('Progressive Feature Group Addition')
plt.grid(True, alpha=0.3)
plt.xticks(df_progressive['step'], df_progressive['added_group'], rotation=45, ha='right')
plt.axhline(y=baseline_c_index, color='r', linestyle='--', label=f'All features: {baseline_c_index:.4f}')
plt.legend()
plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'progressive_addition.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ Saved plot to: {OUTPUT_PATH / 'progressive_addition.png'}")

## 11. Final Optimized Model

In [None]:
# Identify best configuration from experiments
best_k = df_top_k.loc[df_top_k['c_index'].idxmax()]
best_group = df_groups.iloc[0]
best_progressive = df_progressive.loc[df_progressive['c_index'].idxmax()]

print("üìä Best Configurations:")
print("=" * 60)
print(f"\n1. Top-K Features:")
print(f"   - K = {best_k['n_features']}")
print(f"   - C-index = {best_k['c_index']:.4f}")

print(f"\n2. Single Group:")
print(f"   - Group = {best_group['group']}")
print(f"   - Features = {best_group['n_features']}")
print(f"   - C-index = {best_group['c_index']:.4f}")

print(f"\n3. Progressive Addition:")
print(f"   - Step = {best_progressive['step']}")
print(f"   - Last added = {best_progressive['added_group']}")
print(f"   - Features = {best_progressive['total_features']}")
print(f"   - C-index = {best_progressive['c_index']:.4f}")

print(f"\n4. All Features (Baseline):")
print(f"   - Features = {len(feature_cols)}")
print(f"   - C-index = {baseline_c_index:.4f}")

# Determine overall best
all_scores = [
    ('Top-K', best_k['c_index']),
    ('Single Group', best_group['c_index']),
    ('Progressive', best_progressive['c_index']),
    ('All Features', baseline_c_index)
]
best_config = max(all_scores, key=lambda x: x[1])

print(f"\nüèÜ BEST CONFIGURATION: {best_config[0]}")
print(f"   C-index: {best_config[1]:.4f}")

In [None]:
# Train final model with best configuration
# Using Top-K if it performs best, otherwise all features

if best_config[0] == 'Top-K' and best_k['n_features'] < len(feature_cols):
    print(f"\nüî• Training final model with Top {best_k['n_features']} features...")
    top_features_final = importance_df.head(int(best_k['n_features']))['feature'].tolist()
    feature_indices_final = [feature_cols.index(f) for f in top_features_final]
    X_train_final = X_train[:, feature_indices_final]
    X_test_final = X_test[:, feature_indices_final]
    final_features = top_features_final
else:
    print(f"\nüî• Training final model with ALL {len(feature_cols)} features...")
    X_train_final = X_train
    X_test_final = X_test
    final_features = feature_cols

# Train with more trees for final model
FINAL_RSF_CONFIG = RSF_CONFIG.copy()
FINAL_RSF_CONFIG['n_estimators'] = 300

rsf_final = RandomSurvivalForest(**FINAL_RSF_CONFIG)
rsf_final.fit(X_train_final, y_train)

# Predict on test set
pred_final = rsf_final.predict(X_test_final)

# Calculate final C-index
c_index_final = concordance_index_censored(
    y_test['event'],  # Fixed
    y_test['time'],   # Fixed
    pred_final
)[0]

print(f"\n‚úÖ FINAL MODEL PERFORMANCE")
print(f"   - C-index: {c_index_final:.4f}")
print(f"   - Features: {len(final_features)}")
print(f"   - Trees: {FINAL_RSF_CONFIG['n_estimators']}")
print(f"   - Max depth: {FINAL_RSF_CONFIG['max_depth']}")

## 12. Summary Report

In [None]:
# Create comprehensive summary
summary = {
    'experiment': [
        'All Features (RSF)',
        'All Features (GBS)',
        f'Top-{best_k["n_features"]} Features',
        f'Best Group ({best_group["group"]})',
        f'Progressive ({best_progressive["step"]} groups)',
        'Final Model (300 trees)'
    ],
    'n_features': [
        len(feature_cols),
        len(feature_cols),
        int(best_k['n_features']),
        int(best_group['n_features']),
        int(best_progressive['total_features']),
        len(final_features)
    ],
    'c_index': [
        baseline_c_index,
        c_index_gbs,
        best_k['c_index'],
        best_group['c_index'],
        best_progressive['c_index'],
        c_index_final
    ]
}

df_summary = pd.DataFrame(summary)
df_summary = df_summary.sort_values('c_index', ascending=False)

print("\n" + "=" * 80)
print("üìä COMPREHENSIVE EXPERIMENT SUMMARY")
print("=" * 80)
print(df_summary.to_string(index=False))
print("=" * 80)

# Save summary
df_summary.to_csv(OUTPUT_PATH / 'experiment_summary.csv', index=False)
print(f"\n‚úÖ Saved summary to: {OUTPUT_PATH / 'experiment_summary.csv'}")

In [None]:
# Final summary visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Overall comparison
ax1 = axes[0, 0]
ax1.barh(range(len(df_summary)), df_summary['c_index'])
ax1.set_yticks(range(len(df_summary)))
ax1.set_yticklabels(df_summary['experiment'])
ax1.set_xlabel('C-index')
ax1.set_title('All Experiments Comparison')
ax1.invert_yaxis()

# 2. Top-K curve
ax2 = axes[0, 1]
ax2.plot(df_top_k['n_features'], df_top_k['c_index'], marker='o', linewidth=2)
ax2.set_xlabel('Number of Features')
ax2.set_ylabel('C-index')
ax2.set_title('Top-K Feature Selection')
ax2.grid(True, alpha=0.3)

# 3. Feature group performance
ax3 = axes[1, 0]
ax3.barh(range(len(df_groups)), df_groups['c_index'])
ax3.set_yticks(range(len(df_groups)))
ax3.set_yticklabels(df_groups['group'])
ax3.set_xlabel('C-index')
ax3.set_title('Feature Group Performance')
ax3.invert_yaxis()

# 4. Progressive addition
ax4 = axes[1, 1]
ax4.plot(df_progressive['step'], df_progressive['c_index'], marker='o', linewidth=2)
ax4.set_xlabel('Progressive Step')
ax4.set_ylabel('C-index')
ax4.set_title('Progressive Feature Addition')
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'comprehensive_summary.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"‚úÖ Saved comprehensive plot to: {OUTPUT_PATH / 'comprehensive_summary.png'}")

## 13. Key Findings

In [None]:
print("\n" + "=" * 80)
print("üéØ KEY FINDINGS")
print("=" * 80)

print(f"\n1. BEST OVERALL PERFORMANCE:")
print(f"   - C-index: {c_index_final:.4f}")
print(f"   - Configuration: {best_config[0]}")
print(f"   - Features used: {len(final_features)}")

print(f"\n2. TOP 5 MOST IMPORTANT FEATURES:")
for idx, row in importance_df.head(5).iterrows():
    print(f"   {idx+1}. {row['feature']:40s} {row['importance_pct']:6.2f}%")

print(f"\n3. BEST FEATURE GROUP:")
print(f"   - Group: {best_group['group']}")
print(f"   - C-index: {best_group['c_index']:.4f}")
print(f"   - Features: {best_group['n_features']}")

print(f"\n4. OPTIMAL NUMBER OF FEATURES:")
print(f"   - Top-{best_k['n_features']} features achieve C-index {best_k['c_index']:.4f}")
if best_k['n_features'] < len(feature_cols):
    improvement = (best_k['c_index'] - baseline_c_index) * 100
    reduction = (1 - best_k['n_features']/len(feature_cols)) * 100
    print(f"   - Uses {reduction:.1f}% fewer features")
    print(f"   - Performance change: {improvement:+.2f} percentage points")

print(f"\n5. MODEL COMPARISON:")
print(f"   - Random Survival Forest: {baseline_c_index:.4f}")
print(f"   - Gradient Boosting: {c_index_gbs:.4f}")
print(f"   - Difference: {abs(baseline_c_index - c_index_gbs):.4f}")

print("\n" + "=" * 80)

## 14. Next Steps and Recommendations

In [None]:
print("\n" + "=" * 80)
print("üí° RECOMMENDATIONS")
print("=" * 80)

print("\n1. FOR PRODUCTION DEPLOYMENT:")
if len(final_features) < len(feature_cols):
    print(f"   - Use Top-{len(final_features)} features (optimal balance)")
else:
    print(f"   - Use all {len(feature_cols)} features (best performance)")
print(f"   - Model: Random Survival Forest with 300 trees")
print(f"   - Expected C-index: {c_index_final:.4f}")

print("\n2. FURTHER OPTIMIZATION:")
print("   - Hyperparameter tuning (GridSearchCV)")
print("   - Ensemble: Combine RSF + GBS predictions")
print("   - Feature engineering: Create more interactions from top features")
print("   - Cross-validation: Verify performance stability")

print("\n3. DATA COLLECTION PRIORITIES:")
print("   Focus on collecting/improving:")
for idx, row in importance_df.head(3).iterrows():
    print(f"   - {row['feature']} ({row['importance_pct']:.1f}% importance)")

print("\n4. MODEL MONITORING:")
print("   - Track C-index on new data")
print("   - Retrain quarterly with updated POI data")
print("   - Monitor feature importance drift")

print("\n" + "=" * 80)
print("‚úÖ TRAINING COMPLETE - All results saved to:", OUTPUT_PATH)
print("=" * 80)