# Restaurant Survival - XGBoost Progressive Feature Training

**🚀 GPU-Accelerated Progressive Training**

**Strategy**: Start from proven features, add progressively

**Dataset**: 72,082 restaurants (5.5% failure - imbalanced!)

**Phases**:
1. Baseline: 8 proven features (Target C-index >0.70)
2. +Indonesia features (Target +5-10%)
3. +Interaction features (Target +2-5%)
4. +Entropy features (Target +5%)
5. Test 5 imbalance strategies

**Expected Time**: 20-30 minutes total

In [None]:
!pip install -q xgboost scikit-survival

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import gc
import json

import xgboost as xgb
from xgboost import DMatrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sksurv.metrics import concordance_index_censored

# Confirm that every import above succeeded and show which XGBoost build is active.
xgb_version = xgb.__version__
print("‚úÖ Imports complete")
print(f"   XGBoost: {xgb_version}")

In [None]:
# Input dataset directory and output directory used by every later cell.
DATA_PATH = Path('/kaggle/input/jakarta-restaurant-features-complete')
OUTPUT_PATH = Path('/kaggle/working')
OUTPUT_PATH.mkdir(exist_ok=True)

# Shared booster settings; later cells copy this dict before changing anything
# so the baseline configuration itself is never mutated.
XGBOOST_BASE = dict(
    objective='survival:cox',
    eval_metric='cox-nloglik',
    tree_method='hist',
    device='cuda',
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
)

# Echo the effective configuration so a run log shows where data came from.
for banner in (
    f"üìÅ Data: {DATA_PATH}",
    f"üìÅ Output: {OUTPUT_PATH}",
    f"üéØ Device: {XGBOOST_BASE['device']}",
):
    print(banner)

In [None]:
# Read the full feature table, then keep every row whose categorical_label is not 2.
df = pd.read_csv(DATA_PATH / 'jakarta_restaurant_features_complete.csv')
df_mature = df.loc[df['categorical_label'] != 2].copy()

# Build each outcome mask once and reuse it for the counts, shares and ratio below.
is_failure = df_mature['event_observed'] == 1
is_success = df_mature['event_observed'] == 0
n_fail = is_failure.sum()
n_ok = is_success.sum()

print(f"‚úÖ Loaded: {len(df_mature):,} mature restaurants")
print(f"   Failures: {n_fail:,} ({is_failure.mean():.1%})")
print(f"   Successes: {n_ok:,} ({is_success.mean():.1%})")
print(f"\n‚ö†Ô∏è  Imbalance ratio: {n_ok / n_fail:.1f}:1")

In [None]:
# Feature groups for progressive training. Each group is declared on its own and
# then assembled in phase order; the progressive loop relies on that dict order
# to add the groups cumulatively.
_core_features = [
    'nearest_gas_station_m',      # 80% importance (SPBU)
    'transport_density_1km',       # 67% importance
    'transport_count_1000m',       # 56% importance
    'working_age_district',        # 48% importance
    'density_district',            # 45% importance
    'competitors_count_5000m',     # Competition
    'dist_city_center_km',         # Accessibility
    'pasar_count_5000m',           # Indonesia-specific (5km buffer)
]

_indonesia_features = [
    'mosque_count_500m',
    'mosque_count_1000m',
    'nearest_mosque_m',
    'pasar_count_1000m',
    'nearest_pasar_m',
    'convenience_count_1000m',
    'gas_station_count_2000m',
    'friday_prayer_impact',
    'pasar_proximity_score',
    'gas_proximity_score',
]

_interaction_features = [
    'income_pop_interaction',
    'working_age_mall_inv',
    'office_transport',
    'demand_supply_ratio',
    'mosque_residential',
    'pasar_transport',
    'cannibalization_risk_500m',
    'urban_centrality',
]

_entropy_features = [
    'entropy_500m',
    'entropy_1000m',
    'entropy_2000m',
]

FEATURE_PHASES = {
    'Phase 1 - Proven Core (8)': _core_features,
    'Phase 2 - Indonesia Features (10)': _indonesia_features,
    'Phase 3 - Interactions (8)': _interaction_features,
    'Phase 4 - Entropy (3)': _entropy_features,
}

# Report the size of every phase together with the running feature count.
print("üìã Feature Phases Defined:")
total = 0
for phase in FEATURE_PHASES:
    phase_size = len(FEATURE_PHASES[phase])
    total = total + phase_size
    print(f"   {phase}: {phase_size} features (cumulative: {total})")

In [None]:
# Prepare base data
# Columns left out of the candidate feature list (identifiers, dates, labels,
# geometry/coordinates).
exclude = ['osm_id', 'name', 'poi_type', 'date_created', 'date_closed',
           'survival_days', 'event_observed', 'categorical_label', 'geometry', 'lat', 'lon']

all_features = [c for c in df_mature.columns if c not in exclude]

# Fill missing values with the per-column median. numeric_only=True keeps the
# call working if a non-numeric column is present in all_features (such a column
# has no median and is simply left unfilled).
# NOTE(review): the medians are computed on the whole table, i.e. before the
# train/test split performed later, so test rows influence the imputed values.
df_mature[all_features] = df_mature[all_features].fillna(
    df_mature[all_features].median(numeric_only=True)
)

# Prepare XGBoost survival labels. For the survival:cox objective a POSITIVE
# label is an observed event time and a NEGATIVE label marks a right-censored
# row, so only the rows without an observed failure are negated.
y_xgb = df_mature['survival_days'].copy().astype(float)
y_xgb[df_mature['event_observed'] != 1] *= -1

events = df_mature['event_observed'].values
y_abs = np.abs(y_xgb)  # Unsigned durations, for C-index calculation

print(f"‚úÖ Data prepared")
print(f"   Total features available: {len(all_features)}")
print(f"   Label range: {y_xgb.min():.0f} to {y_xgb.max():.0f}")

## Helper Functions

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test, event_test, y_test_abs,
                       feature_names, params, n_rounds=300, phase_name=""):
    """Train an XGBoost booster and score it with the concordance index.

    Args:
        X_train: Training feature matrix handed to ``DMatrix``.
        X_test: Evaluation feature matrix handed to ``DMatrix``.
        y_train: Training labels, in the format expected by ``params['objective']``.
        y_test: Evaluation labels, same format as ``y_train``.
        event_test: Event indicators for the evaluation rows; cast to bool for
            the C-index computation.
        y_test_abs: Unsigned durations for the evaluation rows, used as the
            observed times in the C-index computation.
        feature_names: Column names attached to both matrices.
        params: Parameter dict passed to ``xgb.train``.
        n_rounds: Maximum number of boosting rounds.
        phase_name: Label stored in the returned result under ``'phase'``.

    Returns:
        dict with keys ``phase``, ``n_features``, ``c_index``, ``best_iteration``,
        ``time_s``, ``model`` and ``importance`` (gain-based feature scores).
    """
    start = time.time()

    # Create DMatrix
    dtrain = DMatrix(X_train, label=y_train, feature_names=feature_names)
    dtest = DMatrix(X_test, label=y_test, feature_names=feature_names)

    # Train with early stopping after 30 rounds without improvement.
    # NOTE(review): the evaluation matrix is also the early-stopping set, so the
    # C-index reported below is measured on data that selected the iteration
    # count and is therefore optimistic; a separate validation split would fix it.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=n_rounds,
        evals=[(dtest, 'test')],
        early_stopping_rounds=30,
        verbose_eval=False
    )

    # The booster returned by xgb.train still contains the rounds trained after
    # the best one. Restrict prediction to the trees up to and including the best
    # iteration so the score belongs to the model early stopping selected.
    pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

    # C-index: the prediction is passed as the risk estimate.
    c_index = concordance_index_censored(event_test.astype(bool), y_test_abs, pred)[0]

    elapsed = time.time() - start

    # Feature importance
    importance = model.get_score(importance_type='gain')

    result = {
        'phase': phase_name,
        'n_features': len(feature_names),
        'c_index': c_index,
        'best_iteration': model.best_iteration,
        'time_s': elapsed,
        'model': model,
        'importance': importance
    }

    print(f"   C-index: {c_index:.4f} | Trees: {model.best_iteration} | Time: {elapsed:.1f}s")

    return result

## Progressive Training - Standard (No Imbalance Handling)

In [None]:
print("üî• PROGRESSIVE FEATURE TRAINING (Standard - No Weighting)\n")
print("="*70)

progressive_results = []
cumulative_features = []

for phase_name, phase_features in FEATURE_PHASES.items():
    
    # Add features cumulatively
    cumulative_features.extend(phase_features)
    
    # Check if features exist
    valid_features = [f for f in cumulative_features if f in all_features]
    missing = [f for f in cumulative_features if f not in all_features]
    
    if missing:
        print(f"\n‚ö†Ô∏è  {phase_name}: {len(missing)} features not found:")
        for f in missing[:5]:
            print(f"     - {f}")
        if len(missing) > 5:
            print(f"     ... and {len(missing)-5} more")
    
    if not valid_features:
        print(f"\n‚ùå {phase_name}: No valid features! Skipping...")
        continue
    
    print(f"\n{phase_name}")
    print(f"  Features: {len(valid_features)} (added {len(phase_features)})")
    
    # Get feature indices
    indices = [all_features.index(f) for f in valid_features]
    X = df_mature[valid_features].values
    
    # Split
    X_train, X_test, y_train, y_test, event_train, event_test = train_test_split(
        X, y_xgb, events, test_size=0.2, random_state=42, stratify=events
    )
    
    y_test_abs = np.abs(y_test)
    
    # Train
    result = train_and_evaluate(
        X_train, X_test, y_train, y_test, event_test, y_test_abs,
        valid_features, XGBOOST_BASE, n_rounds=300, phase_name=phase_name
    )
    
    progressive_results.append(result)
    
    # Cleanup
    gc.collect()

print("\n" + "="*70)
print("‚úÖ Progressive training complete!")

In [None]:
# Summary table: one row per trained phase. Maps each output column to the key
# it is read from in the per-phase result dict.
summary_fields = {
    'phase': 'phase',
    'n_features': 'n_features',
    'c_index': 'c_index',
    'trees': 'best_iteration',
    'time_s': 'time_s',
}
df_progressive = pd.DataFrame([
    {column: r[source_key] for column, source_key in summary_fields.items()}
    for r in progressive_results
])

print("\nüìä Progressive Results Summary:")
print("="*70)
print(df_progressive.to_string(index=False))

# Persist the table next to the other run outputs.
df_progressive.to_csv(OUTPUT_PATH / 'progressive_results_standard.csv', index=False)

# Row with the highest C-index.
best_idx = df_progressive['c_index'].idxmax()
best = df_progressive.iloc[best_idx]

best_phase_label = best['phase']
best_phase_size = int(best['n_features'])
best_phase_score = best['c_index']

print(f"\nüèÜ Best Phase: {best_phase_label}")
print(f"   Features: {best_phase_size}")
print(f"   C-index: {best_phase_score:.4f}")

## Phase 5: Test Imbalance Strategies

Test 5 strategies with the BEST feature set from progressive training

In [None]:
# Use best feature set
best_features_idx = best_idx
best_phase_name = progressive_results[best_features_idx]['phase']

# Rebuild the cumulative feature list up to and including the winning phase.
# Taking every entry of cumulative_features would always select the features of
# ALL phases, whichever phase scored best, and would disagree with the phase
# name and feature count reported for the best phase.
best_features = []
for candidate_phase, candidate_features in FEATURE_PHASES.items():
    best_features.extend(f for f in candidate_features if f in all_features)
    if candidate_phase == best_phase_name:
        break

print(f"üî• TESTING 5 IMBALANCE STRATEGIES")
print(f"   Using: {len(best_features)} features from {best_phase_name}")
print(f"\n" + "="*70)

# Prepare data: same stratified split and seed as the progressive runs.
X = df_mature[best_features].values
X_train, X_test, y_train, y_test, event_train, event_test = train_test_split(
    X, y_xgb, events, test_size=0.2, random_state=42, stratify=events
)
y_test_abs = np.abs(y_test)  # unsigned durations for the C-index

# Calculate imbalance ratio on the training rows only.
failure_rate = event_train.mean()
success_rate = 1 - failure_rate
scale_pos_weight = success_rate / failure_rate

print(f"   Imbalance: {success_rate:.1%} success / {failure_rate:.1%} failure")
print(f"   Scale weight: {scale_pos_weight:.2f}\n")

# Filled by the strategy cells below, one result dict per strategy.
imbalance_results = []

In [None]:
# Strategy 1: Standard (No weighting)
print("1Ô∏è‚É£  Standard (No Imbalance Handling)")

# Reference run: unmodified base parameters on the full training split. The
# comparison table later reads this entry as row 0.
result = train_and_evaluate(
    X_train,
    X_test,
    y_train,
    y_test,
    event_test,
    y_test_abs,
    feature_names=best_features,
    params=XGBOOST_BASE,
    phase_name="Standard",
)
imbalance_results.append(result)

In [None]:
# Strategy 2: Weighted (scale_pos_weight)
print("\n2Ô∏è‚É£  Weighted (scale_pos_weight)")

# scale_pos_weight is a class-balancing parameter of the classification-style
# losses; the survival:cox objective does not use it, so putting it in the
# params dict would make this run a copy of the Standard strategy. The same
# ratio is therefore applied as a per-row instance weight on the failure rows.
params_weighted = XGBOOST_BASE.copy()
failure_weights = np.where(np.asarray(event_train) == 1, scale_pos_weight, 1.0)

start = time.time()
dtrain_sp = DMatrix(X_train, label=y_train, weight=failure_weights, feature_names=best_features)
dtest_sp = DMatrix(X_test, label=y_test, feature_names=best_features)

model_sp = xgb.train(
    params_weighted,
    dtrain_sp,
    num_boost_round=300,
    evals=[(dtest_sp, 'test')],
    early_stopping_rounds=30,
    verbose_eval=False
)

# Score with the trees up to the best early-stopping iteration only.
pred_sp = model_sp.predict(dtest_sp, iteration_range=(0, model_sp.best_iteration + 1))
c_sp = concordance_index_censored(event_test.astype(bool), y_test_abs, pred_sp)[0]
elapsed = time.time() - start

# Same result layout as the other strategies so the comparison table can read it.
result = {
    'phase': 'Weighted',
    'n_features': len(best_features),
    'c_index': c_sp,
    'best_iteration': model_sp.best_iteration,
    'time_s': elapsed,
    'model': model_sp,
    'importance': model_sp.get_score(importance_type='gain')
}

print(f"   C-index: {c_sp:.4f} | Trees: {model_sp.best_iteration} | Time: {elapsed:.1f}s")
imbalance_results.append(result)

In [None]:
# Strategy 3: Sample Weights (manual)
print("\n3Ô∏è‚É£  Sample Weights (Manual)")

# 'balanced' weights: each row is weighted inversely to the frequency of its
# event class in the training split.
sample_weights = compute_sample_weight('balanced', event_train)

start = time.time()
dtrain_w = DMatrix(X_train, label=y_train, weight=sample_weights, feature_names=best_features)
dtest_w = DMatrix(X_test, label=y_test, feature_names=best_features)

model_w = xgb.train(
    XGBOOST_BASE,
    dtrain_w,
    num_boost_round=300,
    evals=[(dtest_w, 'test')],
    early_stopping_rounds=30,
    verbose_eval=False
)

# The returned booster still holds the rounds trained after the best one;
# restrict prediction to the trees up to and including the best iteration so
# the score belongs to the model early stopping selected.
pred_w = model_w.predict(dtest_w, iteration_range=(0, model_w.best_iteration + 1))
c_w = concordance_index_censored(event_test.astype(bool), y_test_abs, pred_w)[0]
elapsed = time.time() - start

# Same result layout as the other strategies so the comparison table can read it.
result = {
    'phase': 'Sample Weights',
    'n_features': len(best_features),
    'c_index': c_w,
    'best_iteration': model_w.best_iteration,
    'time_s': elapsed,
    'model': model_w,
    'importance': model_w.get_score(importance_type='gain')
}

print(f"   C-index: {c_w:.4f} | Trees: {model_w.best_iteration} | Time: {elapsed:.1f}s")
imbalance_results.append(result)

In [None]:
# Strategy 4: Undersampling
print("\n4Ô∏è‚É£  Undersampling (Balance 1:3)")

# Convert to numpy arrays if pandas Series, so the positional indices computed
# below select rows by position rather than by index label.
y_train_np = y_train.values if hasattr(y_train, 'values') else y_train
y_test_np = y_test.values if hasattr(y_test, 'values') else y_test

# Positions of each outcome class within the training split.
failures_idx = np.where(event_train == 1)[0]
successes_idx = np.where(event_train == 0)[0]

# Keep all failures, undersample successes (1:3 ratio). The request is capped at
# the number of successes available: sampling without replacement raises
# ValueError when asked for more items than the population holds.
n_failures = len(failures_idx)
n_successes_sample = min(n_failures * 3, len(successes_idx))

np.random.seed(42)
successes_sampled = np.random.choice(successes_idx, size=n_successes_sample, replace=False)

# Merge both classes and shuffle so the rows are not grouped by class.
balanced_idx = np.concatenate([failures_idx, successes_sampled])
np.random.shuffle(balanced_idx)

X_train_bal = X_train[balanced_idx]
y_train_bal = y_train_np[balanced_idx]

print(f"   Original: {len(X_train):,} | Balanced: {len(X_train_bal):,}")

# Only the training rows are resampled; evaluation uses the untouched test split.
result = train_and_evaluate(
    X_train_bal, X_test, y_train_bal, y_test_np, event_test, y_test_abs,
    best_features, XGBOOST_BASE, phase_name="Undersampled"
)
imbalance_results.append(result)

In [None]:
# Strategy 5: Hybrid (Weighted + Undersampled)
print("\n5Ô∏è‚É£  Hybrid (Weighted + Undersampled)")

# The survival:cox objective does not use scale_pos_weight, so setting it in the
# params dict would make this run a copy of the Undersampled strategy. The
# weighting is applied as per-row instance weights on the undersampled rows.
params_hybrid = XGBOOST_BASE.copy()
hybrid_failure_weight = 3.0  # Moderate weight for already balanced data

# Event flags of the undersampled rows, in the same order as X_train_bal.
event_train_bal = np.asarray(event_train)[balanced_idx]
hybrid_weights = np.where(event_train_bal == 1, hybrid_failure_weight, 1.0)

start = time.time()
dtrain_h = DMatrix(X_train_bal, label=y_train_bal, weight=hybrid_weights, feature_names=best_features)
dtest_h = DMatrix(X_test, label=y_test_np, feature_names=best_features)

model_h = xgb.train(
    params_hybrid,
    dtrain_h,
    num_boost_round=300,
    evals=[(dtest_h, 'test')],
    early_stopping_rounds=30,
    verbose_eval=False
)

# Score with the trees up to the best early-stopping iteration only.
pred_h = model_h.predict(dtest_h, iteration_range=(0, model_h.best_iteration + 1))
c_h = concordance_index_censored(event_test.astype(bool), y_test_abs, pred_h)[0]
elapsed = time.time() - start

# Same result layout as the other strategies so the comparison table can read it.
result = {
    'phase': 'Hybrid',
    'n_features': len(best_features),
    'c_index': c_h,
    'best_iteration': model_h.best_iteration,
    'time_s': elapsed,
    'model': model_h,
    'importance': model_h.get_score(importance_type='gain')
}

print(f"   C-index: {c_h:.4f} | Trees: {model_h.best_iteration} | Time: {elapsed:.1f}s")
imbalance_results.append(result)

print("\n" + "="*70)

In [None]:
# Imbalance strategy comparison: one row per strategy, in the order they ran.
# Maps each output column to the key it is read from in the result dict.
comparison_fields = {
    'strategy': 'phase',
    'c_index': 'c_index',
    'trees': 'best_iteration',
    'time_s': 'time_s',
}
df_imbalance = pd.DataFrame([
    {column: r[source_key] for column, source_key in comparison_fields.items()}
    for r in imbalance_results
])

print("\nüìä Imbalance Strategy Comparison:")
print("="*70)
print(df_imbalance.to_string(index=False))

df_imbalance.to_csv(OUTPUT_PATH / 'imbalance_strategy_results.csv', index=False)

# Winner by C-index; row 0 is the first strategy that was appended.
best_strategy_idx = df_imbalance['c_index'].idxmax()
best_strategy = df_imbalance.iloc[best_strategy_idx]
gain_vs_first = (best_strategy['c_index'] - df_imbalance.iloc[0]['c_index'])*100

print(f"\nüèÜ Best Strategy: {best_strategy['strategy']}")
print(f"   C-index: {best_strategy['c_index']:.4f}")
print(f"   Improvement: +{gain_vs_first:.2f}% vs Standard")

## Feature Importance Analysis

In [None]:
# Booster of the winning imbalance strategy and its gain-based feature scores.
best_model = imbalance_results[best_strategy_idx]['model']
importance_dict = best_model.get_score(importance_type='gain')

# One row per scored feature, highest gain first.
importance_df = pd.DataFrame(
    list(importance_dict.items()), columns=['feature', 'importance']
).sort_values('importance', ascending=False)

# Express every gain as a share of the summed gain, in percent.
total_gain = importance_df['importance'].sum()
importance_df['importance_pct'] = importance_df['importance'] / total_gain * 100

print("\nüìä Top 20 Feature Importance (Best Model):")
print("="*70)
top20_view = importance_df[['feature', 'importance_pct']].head(20)
print(top20_view.to_string(index=False))

importance_df.to_csv(OUTPUT_PATH / 'final_feature_importance.csv', index=False)

## Visualizations

In [None]:
# 2x2 dashboard: progression, strategy comparison, top features, size vs score.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

# 1. Progressive C-index, one point per phase with shortened phase labels.
phase_positions = range(len(df_progressive))
short_phase_labels = [p.replace('Phase ', 'P') for p in df_progressive['phase']]
ax1.plot(phase_positions, df_progressive['c_index'], marker='o', linewidth=2, markersize=8)
ax1.set_xticks(phase_positions)
ax1.set_xticklabels(short_phase_labels, rotation=45, ha='right')
ax1.set_ylabel('C-index')
ax1.set_title('Progressive Feature Addition Performance')
ax1.grid(True, alpha=0.3)
ax1.axhline(y=0.7, color='g', linestyle='--', alpha=0.5, label='Target 0.70')
ax1.legend()

# 2. Imbalance strategies as horizontal bars, first strategy at the top.
strategy_positions = range(len(df_imbalance))
ax2.barh(strategy_positions, df_imbalance['c_index'])
ax2.set_yticks(strategy_positions)
ax2.set_yticklabels(df_imbalance['strategy'])
ax2.set_xlabel('C-index')
ax2.set_title('Imbalance Strategy Comparison')
ax2.invert_yaxis()

# 3. Top 15 features of the best model, largest share at the top.
top15 = importance_df.head(15)
feature_positions = range(len(top15))
ax3.barh(feature_positions, top15['importance_pct'])
ax3.set_yticks(feature_positions)
ax3.set_yticklabels(top15['feature'], fontsize=9)
ax3.set_xlabel('Importance (%)')
ax3.set_title('Top 15 Features (Best Model)')
ax3.invert_yaxis()

# 4. Features vs C-index across the progressive phases.
ax4.scatter(df_progressive['n_features'], df_progressive['c_index'], s=100, alpha=0.6)
ax4.set_xlabel('Number of Features')
ax4.set_ylabel('C-index')
ax4.set_title('Features vs Performance')
ax4.grid(True, alpha=0.3)

# Write the figure to disk before showing it.
plt.tight_layout()
summary_png = OUTPUT_PATH / 'training_summary.png'
plt.savefig(summary_png, dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Visualizations saved")

## Final Summary & Recommendations

In [None]:
print("\n" + "="*70)
print("üìù FINAL SUMMARY")
print("="*70)

# Failure count and share are computed from the data instead of printing a
# hard-coded percentage that is only right for one particular dataset.
failure_mask = df_mature['event_observed'] == 1
total_failures = int(failure_mask.sum())
failure_share = failure_mask.mean()

print(f"\n‚úÖ Dataset:")
print(f"   Total: {len(df_mature):,} mature restaurants")
print(f"   Failures: {total_failures:,} ({failure_share:.1%})")
print(f"   Imbalance: {scale_pos_weight:.1f}:1")

print(f"\n‚úÖ Best Configuration:")
print(f"   Features: {int(best['n_features'])} ({best['phase']})")
print(f"   Strategy: {best_strategy['strategy']}")
print(f"   C-index: {best_strategy['c_index']:.4f}")

# importance_df was sorted by importance, so its index labels are the original
# positions rather than ranks; number the rows explicitly.
print(f"\n‚úÖ Top 5 Most Important Features:")
for rank, (_, row) in enumerate(importance_df.head(5).iterrows(), start=1):
    print(f"   {rank}. {row['feature']:40s} ({row['importance_pct']:.2f}%)")

print(f"\n‚úÖ Performance Progression:")
for _, row in df_progressive.iterrows():
    print(f"   {row['phase']:40s}: {row['c_index']:.4f}")

print("\n" + "="*70)

# Verdict bands on the best strategy's C-index.
if best_strategy['c_index'] >= 0.70:
    print("‚úÖ SUCCESS: C-index ‚â• 0.70 (Good discriminative power!)")
elif best_strategy['c_index'] >= 0.60:
    print("‚ö†Ô∏è  MODERATE: C-index 0.60-0.70 (Acceptable but needs improvement)")
else:
    print("‚ùå POOR: C-index < 0.60 (Model needs significant improvement)")

print("="*70)

In [None]:
# Save final config
# Parameter dict that belongs to each strategy with its own params; every other
# strategy was trained with the unmodified base parameters. Looking the winner
# up here keeps the saved parameters in step with the strategy name (previously
# a winning "Weighted" run was recorded with the base parameters).
params_by_strategy = {
    'Weighted': params_weighted,
    'Hybrid': params_hybrid,
}
winning_params = params_by_strategy.get(best_strategy['strategy'], XGBOOST_BASE)

final_config = {
    'best_phase': best['phase'],
    'best_features': best_features,
    'n_features': int(best['n_features']),
    'best_strategy': best_strategy['strategy'],
    'c_index': float(best_strategy['c_index']),
    'model_params': winning_params,
    'imbalance_ratio': float(scale_pos_weight),
    'top_5_features': importance_df.head(5)['feature'].tolist()
}

# Explicit encoding so the file content does not depend on the platform default.
with open(OUTPUT_PATH / 'final_model_config.json', 'w', encoding='utf-8') as f:
    json.dump(final_config, f, indent=2)

print("\n‚úÖ Final configuration saved to final_model_config.json")
print("\nüéâ TRAINING COMPLETE!")