# Restaurant Survival - XGBoost GPU Training

**🚀 GPU-Accelerated**: 10-50x faster than CPU-only methods

**Requirements**:
- Enable **GPU T4 x2** in Kaggle
- Upload `jakarta_restaurant_features_complete.csv`

**Expected Time**: 5-10 minutes (vs hours with scikit-survival)

In [None]:
# Install the two third-party packages this notebook imports:
# xgboost (model training) and scikit-survival (concordance index metric).
# `-q` keeps pip's output quiet.
!pip install -q xgboost scikit-survival

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time
import gc

import xgboost as xgb
from xgboost import DMatrix
from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored

# Confirm the imports succeeded and report which XGBoost version is in use
print(
    "‚úÖ Imports complete",
    f"   XGBoost version: {xgb.__version__}",
    sep="\n",
)

In [None]:
# Config
# Resolve the input location: the Kaggle input directory when it exists,
# otherwise a local relative folder.
_kaggle_input = Path('/kaggle/input')
if _kaggle_input.exists():
    DATA_PATH = _kaggle_input
else:
    DATA_PATH = Path('data')

# Resolve the output location the same way and make sure it exists.
if Path('/kaggle').exists():
    OUTPUT_PATH = Path('/kaggle/working')
else:
    OUTPUT_PATH = Path('outputs')
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Booster parameters shared by every xgb.train call in this notebook
XGBOOST_PARAMS = dict(
    objective='survival:cox',
    eval_metric='cox-nloglik',
    tree_method='hist',
    device='cuda',  # GPU!
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
)

for _line in (
    f"üìÅ Data: {DATA_PATH}",
    f"üìÅ Output: {OUTPUT_PATH}",
    f"üéØ Device: {XGBOOST_PARAMS['device']}",
):
    print(_line)

In [None]:
# Load data
# Kaggle normally mounts each uploaded dataset in its own sub-directory of
# /kaggle/input, so the CSV is usually not directly under DATA_PATH. Try the
# direct path first, then fall back to a recursive search.
CSV_NAME = 'jakarta_restaurant_features_complete.csv'
csv_path = DATA_PATH / CSV_NAME
if not csv_path.is_file():
    csv_matches = sorted(DATA_PATH.rglob(CSV_NAME))
    if not csv_matches:
        raise FileNotFoundError(f"{CSV_NAME} not found under {DATA_PATH}")
    # Sorted so the choice is deterministic if the file is present more than once
    csv_path = csv_matches[0]

df = pd.read_csv(csv_path)
print(f"‚úÖ Loaded {len(df):,} restaurants")

In [None]:
# Filter mature only
# Keep every row whose categorical_label is not 2, as an independent copy of df
is_mature = df['categorical_label'] != 2
df_mature = df.loc[is_mature].copy()

# Rows flagged with event_observed == 1 are reported as failures
failed_mask = df_mature['event_observed'] == 1

print(f"‚úÖ Mature: {len(df_mature):,}")
print(f"   Failed: {failed_mask.sum():,} ({failed_mask.mean():.1%})")

In [None]:
# Get features
# Columns kept out of the feature matrix. This includes survival_days and
# event_observed, which are the training targets and would leak the label.
exclude = ['osm_id', 'name', 'poi_type', 'date_created', 'date_closed',
           'survival_days', 'event_observed', 'categorical_label', 'geometry', 'lat', 'lon']
excluded = set(exclude)
candidate_cols = [c for c in df_mature.columns if c not in excluded]

# Keep only numeric/boolean columns: median imputation and the dense array
# later handed to DMatrix both require numeric data, and a stray text column
# would make either step fail.
numeric_cols = set(
    df_mature[candidate_cols].select_dtypes(include=['number', 'bool']).columns
)
feature_cols = [c for c in candidate_cols if c in numeric_cols]
skipped_cols = [c for c in candidate_cols if c not in numeric_cols]
if skipped_cols:
    print(f"   Skipping {len(skipped_cols)} non-numeric column(s): {skipped_cols}")

# Fill missing values with each column's median
feature_medians = df_mature[feature_cols].median()
df_mature[feature_cols] = df_mature[feature_cols].fillna(feature_medians)

print(f"‚úÖ Features: {len(feature_cols)}")

In [None]:
# Prepare XGBoost survival data
# For XGBoost survival:cox, we need to create labels as: -survival_days for events, +survival_days for censored
# Negative = event occurred, Positive = censored

y_train_xgb = df_mature['survival_days'].copy().astype(float)
# Make negative for events (deaths/failures)
y_train_xgb[df_mature['event_observed'] == 1] *= -1

X = df_mature[feature_cols].values
events = df_mature['event_observed'].values

print(f"‚úÖ Data prepared for XGBoost survival:cox")
print(f"   X shape: {X.shape}")
print(f"   Label range: {y_train_xgb.min():.0f} to {y_train_xgb.max():.0f}")
print(f"   Events (negative): {(y_train_xgb < 0).sum():,}")
print(f"   Censored (positive): {(y_train_xgb > 0).sum():,}")

In [None]:
# Split
X_train, X_test, y_train, y_test, event_train, event_test = train_test_split(
    X, y_train_xgb, events, test_size=0.2, random_state=42, stratify=events
)

print(f"‚úÖ Split: Train {len(X_train):,} | Test {len(X_test):,}")
print(f"   Train events: {(y_train < 0).sum():,}")
print(f"   Test events: {(y_test < 0).sum():,}")

In [None]:
# Create DMatrix (simple format for survival:cox)
# For survival:cox, label is signed: negative = event, positive = censored
dtrain = DMatrix(X_train, label=y_train, feature_names=feature_cols)
dtest = DMatrix(X_test, label=y_test, feature_names=feature_cols)

print(f"‚úÖ DMatrix created (GPU-ready)")
print(f"   Train: {dtrain.num_row():,} x {dtrain.num_col()}")
print(f"   Test: {dtest.num_row():,} x {dtest.num_col()}")

## Training with ALL Features

In [None]:
# Upper bound on boosting rounds; early stopping may end training sooner.
NUM_BOOST_ROUND = 500

print(f"üî• Training XGBoost ({NUM_BOOST_ROUND} trees) on GPU...")
start = time.time()

# NOTE(review): with several entries in `evals`, the LAST one (the test set here)
# drives early stopping, so test metrics reported afterwards are optimistically
# biased. Consider a separate validation split.
model = xgb.train(
    XGBOOST_PARAMS,
    dtrain,
    num_boost_round=NUM_BOOST_ROUND,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    early_stopping_rounds=50,
    verbose_eval=50
)

elapsed = time.time() - start
# Report throughput from the rounds actually trained, not the requested maximum
trees_built = model.num_boosted_rounds()
print(f"\n‚úÖ Training done in {elapsed:.1f}s")
print(f"   Speed: {trees_built/elapsed:.1f} trees/second")
print(f"   Best iteration: {model.best_iteration}")

In [None]:
# Predict
# Score with the trees up to and including the best early-stopping round, so the
# result does not depend on the XGBoost version's default prediction range.
best_range = (0, model.best_iteration + 1)
pred_train = model.predict(dtrain, iteration_range=best_range)
pred_test = model.predict(dtest, iteration_range=best_range)

# For C-index calculation, we need absolute values of labels
# (the sign only encodes censoring; the magnitude is the duration)
y_train_abs = np.abs(y_train)
y_test_abs = np.abs(y_test)

# C-index
# survival:cox predictions are on the hazard-ratio scale (higher = higher risk),
# which is the orientation concordance_index_censored expects for its estimate.
c_train = concordance_index_censored(event_train.astype(bool), y_train_abs, pred_train)[0]
c_test = concordance_index_censored(event_test.astype(bool), y_test_abs, pred_test)[0]

print("\n" + "="*60)
print("üéØ RESULTS (ALL FEATURES)")
print("="*60)
print(f"\n   Train C-index: {c_train:.4f}")
print(f"   Test C-index:  {c_test:.4f}")
print(f"   Overfitting:   {c_train - c_test:.4f}")
print(f"\n   Features: {len(feature_cols)}")
# best_iteration is a zero-based round index, so the tree count is one more
print(f"   Trees: {model.best_iteration + 1}")
print("\n" + "="*60)

## Feature Importance

In [None]:
# Get importance
# Gain-based scores as returned by the booster, keyed by feature name
importance_dict = model.get_score(importance_type='gain')

# One row per scored feature, strongest first (the pre-sort index is kept)
importance_df = pd.DataFrame(
    list(importance_dict.items()),
    columns=['feature', 'importance'],
).sort_values('importance', ascending=False)

# Express each score as a percentage of the total
total_importance = importance_df['importance'].sum()
importance_df['importance_pct'] = importance_df['importance'] / total_importance * 100

print("\nüìä Top 20 Features:")
top20_table = importance_df.head(20)[['feature', 'importance_pct']]
print(top20_table.to_string(index=False))

# Save
importance_df.to_csv(OUTPUT_PATH / 'feature_importance.csv', index=False)

In [None]:
# Plot
# Horizontal bar chart of the 20 highest-ranked features, strongest at the top
top20 = importance_df.head(20)
bar_positions = range(len(top20))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top20['importance_pct'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top20['feature'], fontsize=9)
ax.set_xlabel('Importance (%)')
ax.set_title('Top 20 Features - XGBoost Survival')
ax.invert_yaxis()  # first row of top20 is drawn at the top
fig.tight_layout()
fig.savefig(OUTPUT_PATH / 'importance.png', dpi=150, bbox_inches='tight')
plt.show()

## Test Top-K Features

In [None]:
print("üî• Testing different K values...")

k_values = [10, 20, 30, 40, 50]
results = []

# Loop invariants, computed once: column position by feature name, and the
# unsigned test durations / boolean event flags used for the C-index.
col_position = {name: pos for pos, name in enumerate(feature_cols)}
y_test_abs = np.abs(y_test)
event_test_bool = event_test.astype(bool)

# NOTE(review): each model is early-stopped on the test set and the best K is
# then picked on that same test set, so the reported C-index is optimistic.
for k in k_values:
    print(f"\n  K={k}...", end=" ")
    # Loop-local timer names: reusing `start`/`elapsed` here would overwrite the
    # full-model training time that later cells still read.
    k_start = time.time()

    # Select top k (fewer if fewer than k features received an importance score)
    top_k = importance_df.head(k)['feature'].tolist()
    k_indices = [col_position[f] for f in top_k]

    # Create DMatrix
    dtrain_k = DMatrix(X_train[:, k_indices], label=y_train)
    dtest_k = DMatrix(X_test[:, k_indices], label=y_test)

    # Train
    model_k = xgb.train(
        XGBOOST_PARAMS,
        dtrain_k,
        num_boost_round=300,
        early_stopping_rounds=30,
        evals=[(dtest_k, 'test')],
        verbose_eval=False
    )

    # Evaluate with the trees up to and including the best early-stopping round
    pred_k = model_k.predict(dtest_k, iteration_range=(0, model_k.best_iteration + 1))
    c_k = concordance_index_censored(event_test_bool, y_test_abs, pred_k)[0]

    k_elapsed = time.time() - k_start
    results.append({'k': k, 'c_index': c_k, 'time_s': k_elapsed})

    print(f"C-index: {c_k:.4f} ({k_elapsed:.1f}s)")

    # Free the per-K model and matrices before the next iteration
    del model_k, dtrain_k, dtest_k
    gc.collect()

df_results = pd.DataFrame(results)
print("\nüìä Results:")
print(df_results.to_string(index=False))

df_results.to_csv(OUTPUT_PATH / 'top_k_results.csv', index=False)

best = df_results.loc[df_results['c_index'].idxmax()]
print(f"\nüèÜ Best: k={int(best['k'])} ‚Üí C-index={best['c_index']:.4f}")

## Final Summary

In [None]:
print("\n" + "="*60)
print("üìù FINAL SUMMARY")
print("="*60)

print(f"\n‚úÖ Dataset:")
print(f"   Restaurants: {len(df_mature):,}")
print(f"   Failures: {events.sum():,} ({events.mean():.1%})")
print(f"   Features: {len(feature_cols)}")

print(f"\n‚úÖ Performance:")
print(f"   All features C-index: {c_test:.4f}")
print(f"   Best K={int(best['k'])} C-index: {best['c_index']:.4f}")

print(f"\n‚úÖ Top 5 Features:")
# Number by rank: importance_df keeps its pre-sort index, so the index label
# is the feature's original position, not its place in the ranking.
for rank, row in enumerate(importance_df.head(5).itertuples(index=False), start=1):
    print(f"   {rank}. {row.feature:40s} ({row.importance_pct:.2f}%)")

print(f"\n‚úÖ Training Speed:")
# Use the number of rounds the full model actually trained (early stopping may
# have ended before the requested maximum).
# NOTE(review): `elapsed` is whatever was last assigned to that name -- confirm
# it still holds the full-model training time when this cell runs.
trees_built = model.num_boosted_rounds()
print(f"   GPU-accelerated: {elapsed:.1f}s for {trees_built} trees")
print(f"   Speed: {trees_built/elapsed:.1f} trees/second")

print("\n" + "="*60)
print("üéâ TRAINING COMPLETE!")
print("="*60)