# Restaurant Survival - Balanced Training

**Problem**: Extreme imbalance (94.5% success, 5.5% failure)

**Solution**: 
1. **Undersample** the majority class (successes) to reduce the imbalance; the train/test split is then stratified on the event flag
2. Train on top features only (faster)
3. Use Gradient Boosting Survival (faster than RSF)

In [None]:
# Install scikit-survival quietly (-q); it provides the `sksurv` package imported below.
!pip install -q scikit-survival

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time

from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reaching this line means every import above succeeded.
print("‚úÖ Imports complete")

In [None]:
# Paths: input dataset directory and the directory that receives the CSV outputs.
DATA_PATH = Path('/kaggle/input/jakarta-restaurant-features-complete')
OUTPUT_PATH = Path('/kaggle/working')
# parents=True so the output directory is also created when an intermediate
# directory is missing (e.g. when running outside Kaggle).
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Config
# Target failures-to-successes ratio after undersampling: the number of
# successes kept is int(n_failures / BALANCE_RATIO), capped at the number of
# successes available. With 0.3 that is ~3.3 successes per failure, i.e.
# roughly 23% failures in the balanced set.
BALANCE_RATIO = 0.3
# Intended number of top-ranked features to keep.
# NOTE(review): in the cells shown here this value is only printed; the
# top-K experiment uses its own list of k values.
TOP_K_FEATURES = 30

# Keyword arguments passed unchanged to every GradientBoostingSurvivalAnalysis
# model built in this notebook.
GBS_CONFIG = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'min_samples_split': 10,
    'subsample': 0.8,
    'random_state': 42,
}

print(f"üìÅ Data: {DATA_PATH}")
print(f"üéØ Balance ratio: {BALANCE_RATIO}")
print(f"üéØ Top features: {TOP_K_FEATURES}")

In [None]:
# Read the full feature table, then keep every row whose categorical_label
# is not 2; the filtered copy is what the rest of the notebook works on.
csv_file = DATA_PATH / 'jakarta_restaurant_features_complete.csv'
df = pd.read_csv(csv_file)

keep_rows = df['categorical_label'] != 2
df_mature = df.loc[keep_rows].copy()

# Report the row count and the event / non-event breakdown.
print(f"‚úÖ Loaded: {len(df_mature):,}")
print(f"   Failures: {(df_mature['event_observed'] == 1).sum():,}")
print(f"   Successes: {(df_mature['event_observed'] == 0).sum():,}")

In [None]:
# Undersample the success rows (event_observed == 0) so that the failure rows
# (event_observed == 1) make up a larger share of the data.
failure_mask = df_mature['event_observed'] == 1
success_mask = df_mature['event_observed'] == 0
failures = df_mature[failure_mask]
successes = df_mature[success_mask]

# Number of successes to keep is derived from the failure count and
# BALANCE_RATIO, and can never exceed the successes actually available.
n_failures = len(failures)
n_successes_sample = int(n_failures / BALANCE_RATIO)
sample_size = min(n_successes_sample, len(successes))

successes_sampled = successes.sample(n=sample_size, random_state=42)

# Stack both groups, then shuffle the rows with a fixed seed and renumber them.
df_balanced = (
    pd.concat([failures, successes_sampled], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"\n‚úÖ Balanced dataset:")
print(f"   Total: {len(df_balanced):,}")
print(f"   Failures: {(df_balanced['event_observed'] == 1).sum():,} ({(df_balanced['event_observed'] == 1).mean():.1%})")
print(f"   Successes: {(df_balanced['event_observed'] == 0).sum():,} ({(df_balanced['event_observed'] == 0).mean():.1%})")

In [None]:
# Columns that are identifiers, targets, or raw geometry rather than model inputs.
exclude = ['osm_id', 'name', 'poi_type', 'date_created', 'date_closed',
           'survival_days', 'event_observed', 'categorical_label', 'geometry', 'lat', 'lon']

# Only numeric/boolean columns can be median-imputed and fed to the model.
# Restricting to them keeps a stray text column that is missing from `exclude`
# from breaking the median() call or the numeric feature matrix built later.
numeric_columns = df_balanced.select_dtypes(include=['number', 'bool']).columns
feature_cols = [c for c in numeric_columns if c not in exclude]

skipped_cols = [
    c for c in df_balanced.columns
    if c not in exclude and c not in feature_cols
]
if skipped_cols:
    print(f"Skipping {len(skipped_cols)} non-numeric column(s): {skipped_cols}")

# Fill missing values with each feature's median.
# NOTE(review): the medians are computed over the whole balanced frame, before
# the train/test split, so test rows influence the imputed values.
feature_medians = df_balanced[feature_cols].median()
df_balanced[feature_cols] = df_balanced[feature_cols].fillna(feature_medians)

print(f"‚úÖ Features: {len(feature_cols)}")

In [None]:
# Build the structured survival target: a boolean event flag plus the
# observed duration taken from the survival_days column.
event_flags = df_balanced['event_observed'].astype(bool)
durations = df_balanced['survival_days']
y = Surv.from_arrays(event=event_flags, time=durations)

# Feature matrix as a plain array, columns in feature_cols order.
X = df_balanced[feature_cols].values

# 80/20 hold-out split, stratified on the event flag so both parts keep the
# same failure share.
split_options = {
    'test_size': 0.2,
    'random_state': 42,
    'stratify': df_balanced['event_observed'],
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"‚úÖ Split: Train {len(X_train):,} | Test {len(X_test):,}")

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# rows only and the same transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Scaled")

## Train with ALL Features First

In [None]:
# Baseline: fit one gradient-boosted survival model on every feature and score
# it on the held-out split. The timer covers fitting, prediction and scoring.
print("üî• Training GBS with ALL features (balanced data)...")
start = time.time()

gbs_all = GradientBoostingSurvivalAnalysis(**GBS_CONFIG)
gbs_all.fit(X_train_scaled, y_train)

pred_all = gbs_all.predict(X_test_scaled)
# The first element of the returned tuple is the concordance index.
concordance = concordance_index_censored(y_test['event'], y_test['time'], pred_all)
c_all = concordance[0]

elapsed = time.time() - start

print(f"\n‚úÖ Done in {elapsed:.1f}s")
print(f"   C-index: {c_all:.4f}")

In [None]:
# Rank the features by the importance reported by the all-features model.
importance_table = pd.DataFrame({
    'feature': feature_cols,
    'importance': gbs_all.feature_importances_
})
importance_df = importance_table.sort_values('importance', ascending=False)

# Express each importance as a percentage of the total.
total_importance = importance_df['importance'].sum()
importance_df['importance_pct'] = importance_df['importance'] / total_importance * 100

print("\nüìä Top 20 Features:")
top_twenty = importance_df.head(20)
print(top_twenty[['feature', 'importance_pct']].to_string(index=False))

# Persist the full ranking alongside the other notebook outputs.
importance_df.to_csv(OUTPUT_PATH / 'feature_importance_balanced.csv', index=False)

## Test Top-K Features

In [None]:
# Retrain the model on the k most important features for several values of k
# and compare the held-out C-index and the wall-clock time of each run.
# NOTE(review): the best k is picked on the same test split that is used for
# reporting, so the winning score is an optimistic estimate.
print("\nüî• Testing Top-K features...")

k_values = [10, 20, 30, 40, 50]
results = []

# Column position of every feature in the scaled matrices, built once instead
# of running a linear list.index() search per feature inside the loop
# (first occurrence wins, matching list.index()).
feature_position = {}
for position, feature_name in enumerate(feature_cols):
    feature_position.setdefault(feature_name, position)

ranked_features = importance_df['feature'].tolist()
n_available = len(ranked_features)
evaluated_sizes = set()

for k in k_values:
    # A k larger than the number of features would just reselect all of them;
    # cap it so the recorded k is the real feature count, and do not retrain
    # an identical model for every oversized k.
    k_used = min(k, n_available)
    if k_used in evaluated_sizes:
        print(f"  K={k}... skipped (only {n_available} features available)")
        continue
    evaluated_sizes.add(k_used)

    print(f"  K={k_used}...", end=" ")
    start = time.time()

    # Select top k
    top_k = ranked_features[:k_used]
    k_indices = [feature_position[f] for f in top_k]

    X_train_k = X_train_scaled[:, k_indices]
    X_test_k = X_test_scaled[:, k_indices]

    # Train
    gbs_k = GradientBoostingSurvivalAnalysis(**GBS_CONFIG)
    gbs_k.fit(X_train_k, y_train)

    # Evaluate
    pred_k = gbs_k.predict(X_test_k)
    c_k = concordance_index_censored(y_test['event'], y_test['time'], pred_k)[0]

    elapsed = time.time() - start
    results.append({'k': k_used, 'c_index': c_k, 'time_s': elapsed})

    print(f"C-index: {c_k:.4f} ({elapsed:.1f}s)")

df_results = pd.DataFrame(results)
print("\nüìä Results:")
print(df_results.to_string(index=False))

df_results.to_csv(OUTPUT_PATH / 'top_k_results_balanced.csv', index=False)

# Row with the highest held-out C-index.
best = df_results.loc[df_results['c_index'].idxmax()]
print(f"\nüèÜ Best: k={int(best['k'])} ‚Üí C-index={best['c_index']:.4f}")

## Summary

In [None]:
# Final report: class balance before and after undersampling, model scores,
# the five most important features, and a coarse verdict on the baseline.
print("\n" + "="*60)
print("üìù SUMMARY")
print("="*60)

print(f"\n‚úÖ Balancing:")
# Computed from the loaded data rather than hard-coded, so the line stays
# correct when the input dataset changes.
print(f"   Original: {len(df_mature):,} ({(df_mature['event_observed'] == 1).mean():.1%} failure)")
print(f"   Balanced: {len(df_balanced):,} ({(df_balanced['event_observed'] == 1).mean():.1%} failure)")

print(f"\n‚úÖ Performance:")
print(f"   All features: {c_all:.4f}")
print(f"   Best Top-K: {best['c_index']:.4f} (k={int(best['k'])})")

print(f"\n‚úÖ Top 5 Features:")
# importance_df is sorted by importance, so its index labels are the original
# column positions, not ranks; number the rows with enumerate instead.
for rank, (_, row) in enumerate(importance_df.head(5).iterrows(), start=1):
    print(f"   {rank}. {row['feature']:40s} ({row['importance_pct']:.2f}%)")

print("\n" + "="*60)

# Verdict is based on the all-features model's C-index.
if c_all > 0.7:
    print("‚úÖ SUCCESS: C-index > 0.7 (good performance!)")
elif c_all > 0.6:
    print("‚ö†Ô∏è  MODERATE: C-index 0.6-0.7 (needs improvement)")
else:
    print("‚ùå POOR: C-index < 0.6 (model not working well)")

print("="*60)