# Restaurant Survival - GBS with Proven Features

**🎯 Goal**: Reproduce a C-index of 0.76 using Gradient Boosting Survival Analysis (GBS)

**Strategy**: Use the 8 proven features from the thematic experiment

**Dataset**: 72,082 mature restaurants (5.5% failure)

**Model**: Gradient Boosting Survival Analysis (GBS; faster than a Random Survival Forest, RSF, which is also trained below for comparison)

**Expected Time**: 5-10 minutes (CPU only)

In [None]:
!pip install -q scikit-survival

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import json

from sksurv.ensemble import GradientBoostingSurvivalAnalysis, RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reaching this line means every import above resolved without error.
print("‚úÖ Imports complete")

In [None]:
# Input dataset directory and the directory all artefacts are written to
DATA_PATH = Path('/kaggle/input') / 'jakarta-restaurant-features-complete'
OUTPUT_PATH = Path('/kaggle/working')
OUTPUT_PATH.mkdir(exist_ok=True)  # no-op when the directory already exists

# Echo both locations so the run log shows where data came from and went to
for _label, _location in (("Data", DATA_PATH), ("Output", OUTPUT_PATH)):
    print(f"üìÅ {_label}: {_location}")

In [None]:
# Read the complete feature table into memory
df = pd.read_csv(DATA_PATH / 'jakarta_restaurant_features_complete.csv')

# Boolean masks over the event flag: 1 is reported as a failure, 0 as a success
failed_mask = df['event_observed'] == 1
survived_mask = df['event_observed'] == 0
n_failed = failed_mask.sum()
n_survived = survived_mask.sum()

print(f"‚úÖ Loaded: {len(df):,} restaurants")
print(f"   Failures: {n_failed:,} ({failed_mask.mean():.1%})")
print(f"   Successes: {n_survived:,} ({survived_mask.mean():.1%})")
print(f"\n‚ö†Ô∏è  Imbalance ratio: {n_survived / n_failed:.1f}:1")

## Define Proven Features

Based on Thematic Experiment (C-index 0.7599):
- 8 features with 5km buffer
- Focus on competition, demographics, accessibility

In [None]:
# Feature set 1: the eight features carried over from the thematic experiment
PROVEN_FEATURES_8 = [
    'competitors_count_5000m',    # competitors within 5 km (previously named competitors_5000m)
    'nearest_competitor_m',       # distance to the closest competitor
    'density_district',           # population density
    'income_district_m',          # district income
    'working_age_district',       # working-age population
    'transport_count_5000m',      # transport access within 5 km
    'dist_city_center_km',        # distance to the city centre
    'pasar_count_5000m'           # traditional markets within 5 km
]

# Feature set 2: top-ranked features from the feature-importance analysis
TOP_FEATURES_10 = [
    'pasar_count_1000m',          # 89.5% importance - dominant
    'transport_density_1km',      # 67% importance
    'transport_count_1000m',      # 56% importance
    'working_age_district',       # 48% importance
    'density_district',           # 45% importance
    'nearest_competitor_m',       # competition factor
    'dist_city_center_km',        # accessibility
    'income_district_m',          # income level
    'competitors_count_5000m',    # competition count
    'office_transport'            # interaction feature
]

# Report how many features each set holds
print("üìã Feature Sets Defined:")
for _position, (_label, _members) in enumerate(
    (("Proven 8", PROVEN_FEATURES_8), ("Top 10", TOP_FEATURES_10)), start=1
):
    print(f"   Set {_position} ({_label}): {len(_members)} features")

In [None]:
# Column names available in the loaded table (reused by later cells)
all_features = list(df.columns)

# Print a present/absent marker for every feature of both candidate sets
for _label, _candidates in (("Proven 8", PROVEN_FEATURES_8), ("Top 10", TOP_FEATURES_10)):
    print(f"\nüîç Checking {_label} Features:")
    for _feature in _candidates:
        if _feature in all_features:
            _mark = "‚úì"
        else:
            _mark = "‚úó"
        print(f"   {_mark} {_feature}")

In [None]:
# Keep only the features that are actual columns, preserving each set's order
_available = set(all_features)
proven_8_valid = [name for name in PROVEN_FEATURES_8 if name in _available]
top_10_valid = [name for name in TOP_FEATURES_10 if name in _available]

# Show how many of the requested features survived the filter
print(f"‚úÖ Valid features:")
for _label, _kept, _requested in (
    ("Proven 8", proven_8_valid, PROVEN_FEATURES_8),
    ("Top 10", top_10_valid, TOP_FEATURES_10),
):
    print(f"   {_label}: {len(_kept)}/{len(_requested)}")

## Prepare Data for Survival Analysis

In [None]:
# Helpers and entry point to prepare the data and train one survival model
def _build_survival_model(model_type):
    """Return an unfitted estimator: GBS for 'GBS', a Random Survival Forest otherwise."""
    if model_type == 'GBS':
        return GradientBoostingSurvivalAnalysis(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=10,
            subsample=0.8,
            random_state=42,
            verbose=0
        )
    # Every other value selects RSF (kept for backward compatibility).
    return RandomSurvivalForest(
        n_estimators=100,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1,
        verbose=0
    )


def _feature_importance_table(model, feature_list):
    """Return importances sorted descending with a percentage column, or None if unavailable."""
    # hasattr() only swallows AttributeError. scikit-survival's
    # RandomSurvivalForest exposes feature_importances_ as a property that
    # raises NotImplementedError, so both exceptions are handled explicitly.
    try:
        raw_importance = model.feature_importances_
    except (AttributeError, NotImplementedError):
        return None

    table = pd.DataFrame({
        'feature': feature_list,
        'importance': raw_importance
    }).sort_values('importance', ascending=False)
    table['importance_pct'] = table['importance'] / table['importance'].sum() * 100
    return table


def train_survival_model(df, feature_list, model_type='GBS', name=""):
    """
    Train a survival model on an 80/20 stratified split and report its C-index.

    Args:
        df: DataFrame holding the feature columns plus 'event_observed'
            and 'survival_days'.
        feature_list: List of feature column names to use.
        model_type: 'GBS' for gradient boosting; any other value trains a
            Random Survival Forest ('RSF').
        name: Label for this run, used in the printed report and the result.

    Returns:
        dict with keys 'name', 'model_type', 'n_features', 'features',
        'c_train', 'c_test', 'time_s', 'model', 'scaler' and 'importance'
        (a DataFrame, or None when the model exposes no importances).

    Raises:
        ValueError: if feature_list is empty.
        KeyError: if a feature or survival column is missing from df.
    """
    # Fail early with a clear message instead of deep inside pandas/sksurv.
    if len(feature_list) == 0:  # len() also works for a pandas Index
        raise ValueError("feature_list is empty; nothing to train on")
    missing = [
        col for col in [*feature_list, 'event_observed', 'survival_days']
        if col not in df.columns
    ]
    if missing:
        raise KeyError(f"Columns missing from df: {missing}")

    print(f"\n{'='*70}")
    print(f"üî• TRAINING: {name}")
    print(f"   Model: {model_type}")
    print(f"   Features: {len(feature_list)}")
    print(f"{'='*70}")

    # Structured (event, time) target expected by scikit-survival
    y = Surv.from_arrays(
        event=df['event_observed'].astype(bool),
        time=df['survival_days']
    )

    # Split before imputing so the test rows cannot influence the fill values
    X_train_df, X_test_df, y_train, y_test = train_test_split(
        df[feature_list], y, test_size=0.2, random_state=42, stratify=df['event_observed']
    )

    # Fill missing values with medians learned from the training rows only
    train_medians = X_train_df.median()
    X_train = X_train_df.fillna(train_medians).values
    X_test = X_test_df.fillna(train_medians).values

    print(f"\n‚úÖ Data split: Train {len(X_train):,} | Test {len(X_test):,}")

    # Scale features (scaler is fitted on the training rows only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    start = time.time()
    model = _build_survival_model(model_type)

    print(f"\n‚è≥ Training {model_type}...")
    model.fit(X_train_scaled, y_train)
    elapsed = time.time() - start

    # Predict risk scores
    pred_train = model.predict(X_train_scaled)
    pred_test = model.predict(X_test_scaled)

    # C-index (first element of the returned tuple)
    c_train = concordance_index_censored(y_train['event'], y_train['time'], pred_train)[0]
    c_test = concordance_index_censored(y_test['event'], y_test['time'], pred_test)[0]

    print(f"\n‚úÖ Training complete in {elapsed:.1f}s")
    print(f"\nüìä RESULTS:")
    print(f"   Train C-index: {c_train:.4f}")
    print(f"   Test C-index:  {c_test:.4f}")
    print(f"   Overfitting:   {c_train - c_test:.4f}")

    # Feature importance (only for models that provide it)
    importance_df = _feature_importance_table(model, feature_list)
    if importance_df is not None:
        print(f"\nüìä Top 5 Features:")
        top_five = importance_df.head(5)
        for feature, pct in zip(top_five['feature'], top_five['importance_pct']):
            print(f"   {feature:30s} {pct:6.2f}%")

    return {
        'name': name,
        'model_type': model_type,
        'n_features': len(feature_list),
        'features': feature_list,
        'c_train': c_train,
        'c_test': c_test,
        'time_s': elapsed,
        'model': model,
        'scaler': scaler,
        'importance': importance_df
    }

## Experiment 1: Proven 8 Features with GBS

In [None]:
# Experiment 1: gradient boosting on the filtered "Proven 8" feature set
result_proven_gbs = train_survival_model(
    df=df,
    feature_list=proven_8_valid,
    model_type='GBS',
    name="Proven 8 Features (GBS)",
)

## Experiment 2: Proven 8 Features with RSF

In [None]:
# Experiment 2: random survival forest on the filtered "Proven 8" feature set
result_proven_rsf = train_survival_model(
    df=df,
    feature_list=proven_8_valid,
    model_type='RSF',
    name="Proven 8 Features (RSF)",
)

## Experiment 3: Top 10 Features with GBS

In [None]:
# Experiment 3: gradient boosting on the filtered "Top 10" feature set
result_top10_gbs = train_survival_model(
    df=df,
    feature_list=top_10_valid,
    model_type='GBS',
    name="Top 10 Features (GBS)",
)

## Experiment 4: Top 10 Features with RSF

In [None]:
# Experiment 4: random survival forest on the filtered "Top 10" feature set
result_top10_rsf = train_survival_model(
    df=df,
    feature_list=top_10_valid,
    model_type='RSF',
    name="Top 10 Features (RSF)",
)

## Comparison Summary

In [None]:
# Collect the four experiment outputs in the order they were run
results = [result_proven_gbs, result_proven_rsf, result_top10_gbs, result_top10_rsf]


def _summary_row(run):
    """Flatten one experiment result into a row of the comparison table."""
    train_score, test_score = run['c_train'], run['c_test']
    return {
        'name': run['name'],
        'model': run['model_type'],
        'features': run['n_features'],
        'c_train': train_score,
        'c_test': test_score,
        'overfitting': train_score - test_score,
        'time_s': run['time_s'],
    }


df_results = pd.DataFrame(list(map(_summary_row, results)))

_rule = "=" * 80
print("\n" + _rule)
print("üìä FINAL COMPARISON")
print(_rule)
print(df_results.to_string(index=False))

# Persist the comparison table next to the other outputs
df_results.to_csv(OUTPUT_PATH / 'comparison_results.csv', index=False)

# Pick the run with the highest test C-index; the table has a default
# 0..n-1 index, so the label from idxmax() is also the row position
# (and the matching index into `results`).
best_idx = df_results['c_test'].idxmax()
best = df_results.iloc[best_idx]

print(f"\nüèÜ BEST MODEL:")
print(f"   Name: {best['name']}")
print(f"   C-index: {best['c_test']:.4f}")
print(f"   Time: {best['time_s']:.1f}s")

## Visualization

In [None]:
# One figure, two panels: scores on the left, feature importances on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
score_ax, importance_ax = axes

# Left panel: test C-index of every experiment, first experiment at the top
row_positions = range(len(df_results))
score_ax.barh(row_positions, df_results['c_test'], alpha=0.7)
score_ax.set_yticks(row_positions)
score_ax.set_yticklabels(df_results['name'], fontsize=9)
score_ax.set_xlabel('Test C-index')
score_ax.set_title('Model Performance Comparison')
score_ax.axvline(x=0.76, color='g', linestyle='--', alpha=0.5, label='Target 0.76')
score_ax.legend()
score_ax.invert_yaxis()

# Right panel: up to ten most important features of the best run
# (left blank when that run has no importance table)
best_result = results[best_idx]
importance_table = best_result['importance']
if importance_table is not None:
    leading = importance_table.head(10)
    bar_positions = range(len(leading))
    importance_ax.barh(bar_positions, leading['importance_pct'])
    importance_ax.set_yticks(bar_positions)
    importance_ax.set_yticklabels(leading['feature'], fontsize=9)
    importance_ax.set_xlabel('Importance (%)')
    importance_ax.set_title(f'Top 10 Features ({best["name"]})')
    importance_ax.invert_yaxis()

plt.tight_layout()
plt.savefig(OUTPUT_PATH / 'comparison_visualization.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Visualization saved")

## Save Best Model

In [None]:
# Save best model configuration
best_result = results[best_idx]

# Single source of truth for the target, so 'target_c_index' and 'achieved' agree
_target_c_index = 0.76
_best_c_test = float(best_result['c_test'])

config = {
    'name': best_result['name'],
    'model_type': best_result['model_type'],
    'n_features': best_result['n_features'],
    'features': list(best_result['features']),
    'c_train': float(best_result['c_train']),
    'c_test': _best_c_test,
    'time_s': float(best_result['time_s']),
    'dataset': {
        'total': len(df),
        'failures': int((df['event_observed'] == 1).sum()),
        'failure_rate': float((df['event_observed'] == 1).mean())
    },
    'target_c_index': _target_c_index,
    # Explicit bool(): comparing a NumPy float gives numpy.bool_, which the
    # json module cannot serialize. The threshold is the stated target
    # (previously 0.70, which contradicted 'target_c_index').
    'achieved': bool(_best_c_test >= _target_c_index)
}

# Save feature importance (only available for models that expose it)
if best_result['importance'] is not None:
    best_result['importance'].to_csv(OUTPUT_PATH / 'best_model_feature_importance.csv', index=False)
    config['top_5_features'] = best_result['importance'].head(5)['feature'].tolist()

# Serialize first, then write: a serialization error can no longer leave a
# truncated JSON file behind.
_config_json = json.dumps(config, indent=2)
(OUTPUT_PATH / 'best_model_config.json').write_text(_config_json, encoding='utf-8')

print(f"\n‚úÖ Best model configuration saved")

## Final Summary

In [None]:
# Closing report: dataset stats, per-experiment scores, best run and verdict
_rule = "=" * 80
print("\n" + _rule)
print("üìù FINAL SUMMARY")
print(_rule)

# Dataset statistics, derived once from the event flag
failure_mask = df['event_observed'] == 1
failure_count = failure_mask.sum()
non_failure_count = (df['event_observed'] == 0).sum()

print(f"\n‚úÖ Dataset:")
print(f"   Total: {len(df):,} restaurants")
print(f"   Failures: {failure_count:,} ({failure_mask.mean():.1%})")
print(f"   Imbalance: {non_failure_count / failure_count:.1f}:1")

# Test C-index of each experiment, numbered in run order
print(f"\n‚úÖ Experiments Run: {len(results)}")
_experiments = (
    ("Proven 8 + GBS", result_proven_gbs),
    ("Proven 8 + RSF", result_proven_rsf),
    ("Top 10 + GBS", result_top10_gbs),
    ("Top 10 + RSF", result_top10_rsf),
)
for _number, (_label, _run) in enumerate(_experiments, start=1):
    print(f"   {_number}. {_label}: {_run['c_test']:.4f}")

best_score = best['c_test']
print(f"\nüèÜ Best Result:")
print(f"   Model: {best['name']}")
print(f"   C-index: {best_score:.4f}")
print(f"   Target: 0.7600")
print(f"   Gap: {0.76 - best_score:.4f}")

# Verdict: the first threshold (highest first) that the best score reaches
print("\n" + _rule)
_verdicts = (
    (0.76, "‚úÖ SUCCESS: C-index ‚â• 0.76 (Target achieved!)"),
    (0.70, "‚úÖ GOOD: C-index ‚â• 0.70 (Close to target!)"),
    (0.60, "‚ö†Ô∏è  MODERATE: C-index 0.60-0.70 (Needs improvement)"),
)
print(next(
    (_text for _floor, _text in _verdicts if best_score >= _floor),
    "‚ùå POOR: C-index < 0.60 (Significant improvement needed)",
))
print(_rule)

print("\nüéâ TRAINING COMPLETE!")