In [None]:
# Change into /content and clone the project repository there
# (the /content path suggests a Google Colab runtime — confirm before running elsewhere).
%cd /content
!git clone https://github.com/deenuy/869-spaceship-titanic.git
# Disabled alternative: update an existing checkout instead of cloning again.
# !git pull origin main

In [None]:
!pip install -r /content/869-spaceship-titanic/requirements.txt

# Preliminaries: Inspect and Set up environment

No action is required on your part in this section. The cell below imports the libraries used throughout the notebook and silences `UserWarning` messages.

In [1]:
# 🧰 General-purpose libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib


# 🧪 Scikit-learn preprocessing & pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# 🔍 Scikit-learn model selection
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV,
    StratifiedKFold
)

# 🧠 Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# 🚀 Gradient boosting frameworks
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 📊 Evaluation
from sklearn.metrics import accuracy_score, classification_report

# 🧪 Sample dataset (for testing/demo)
from sklearn.datasets import make_classification

import warnings
# Ignore every warning whose category is UserWarning from this point on.
warnings.filterwarnings('ignore', category=UserWarning)


In [2]:
# ----------------------------------------------------------------
# TRAINING DATA: read the engineered dataset and keep only the
# columns listed in the selected-features file
# ----------------------------------------------------------------

print("📥 LOADING TRAINING DATA WITH SELECTED FEATURES")
print("=" * 55)

# Full engineered training table
df_processed = pd.read_csv('../data/processed/train_features_engineered.csv')
print(f"   ✅ Training dataset loaded: {df_processed.shape}")

# Feature names are stored in the 'best_features' column of the selection file
selected_features_table = pd.read_csv('../data/processed/best_features_selected.csv')
best_features = selected_features_table['best_features'].tolist()
print(f"   ✅ Selected features loaded: {len(best_features)} features")

# Design matrix restricted to the selected columns, plus the target column
X_train = df_processed[best_features]
y_train = df_processed['Transported']

# Counts reported below; the "- 2" leaves out PassengerId and Transported
n_original = df_processed.shape[1] - 2
n_samples, n_selected = X_train.shape
reduction_pct = (n_original - n_selected) / n_original * 100

summary_lines = [
    "\n📊 FILTERED TRAINING DATA SUMMARY:",
    f"   📊 Original features: {n_original}",
    f"   📊 Selected features: {n_selected}",
    f"   📊 Feature reduction: {reduction_pct:.1f}%",
    f"   📊 Training samples: {n_samples}",
    f"   📊 Target distribution: {y_train.sum()}/{len(y_train)} transported",
]
print("\n".join(summary_lines))

# Numbered list of the features handed to the optimizers
print("\n🎯 SELECTED FEATURES FOR OPTIMIZATION:")
for position, feature_name in enumerate(best_features, start=1):
    print(f"   {position:2d}. {feature_name}")

print("\n✅ Training data ready for hyperparameter optimization!")

📥 LOADING TRAINING DATA WITH SELECTED FEATURES
   ✅ Training dataset loaded: (8693, 19)
   ✅ Selected features loaded: 6 features

📊 FILTERED TRAINING DATA SUMMARY:
   📊 Original features: 17
   📊 Selected features: 6
   📊 Feature reduction: 64.7%
   📊 Training samples: 8693
   📊 Target distribution: 4378/8693 transported

🎯 SELECTED FEATURES FOR OPTIMIZATION:
    1. HomePlanet
    2. CryoSleep
    3. RoomService
    4. TotalSpend
    5. LuxurySpend
    6. Cabin_HomePlanet

✅ Training data ready for hyperparameter optimization!


### STEP 1: Hyperparameter Tuning of XGBoost for Accuracy Optimization using Grid Search
This step fine-tunes the XGBoost model using GridSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to systematically identify optimal configurations for deployment while maintaining generalizability and avoiding overfitting through complete hyperparameter exploration within defined ranges.

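The XGBoost classifier is wrapped in a scikit-learn `Pipeline` and tuned with grid search. The `ColumnTransformer` preprocessing step is currently commented out, so the model is trained directly on the selected, already-engineered features loaded above. This version includes: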

* Modular pipeline with XGBoost
* Grid search over refined hyperparameter ranges based on domain expertise
* Accuracy as the scoring metric
* Systematic parameter space exploration through complete enumeration within grid boundaries
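* Deterministic and reproducible results (fixed `random_state`); the best configuration is guaranteed only among the combinations listed in the grid, not over the full hyperparameter space
* Computational cost that grows multiplicatively with the number of values per parameter (864 candidates × 5 folds = 4,320 fits here)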

In [4]:
%%time

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter search space for XGBoost (refined around best random search results)
param_grid = {
    'clf__n_estimators': [150, 200, 250],              # 3 values (was 4)
    'clf__max_depth': [5, 6, 7],                       # 3 values (was 3) ✓
    'clf__learning_rate': [0.03, 0.05, 0.07],          # 3 values (was 4)
    'clf__subsample': [0.9, 1.0],                      # 2 values (was 3)
    'clf__colsample_bytree': [0.9, 1.0],               # 2 values (was 3)
    'clf__reg_alpha': [0.01, 0.02],                    # 2 values (was 3)
    'clf__reg_lambda': [0.8, 1.2],                     # 2 values (was 3)
    'clf__min_child_weight': [1, 3]                    # 2 values (was 3)
}

# Run hyperparameter optimization using GridSearchCV
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
grid_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(grid_search.best_params_)
print(f"\n✅ Best CV Accuracy: {grid_search.best_score_:.4f}")

# Extract all grid search results for tracking
cv_results = grid_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'XGBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == grid_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[f'{param_name}'] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('results/xgboost_grid_search_results.csv', index=False)
print(f"\n💾 Results saved to: results/xgboost_grid_search_results.csv")

# Extract best model for reuse or export
best_model = grid_search.best_estimator_

# Save best model
joblib.dump(best_model, '../models/best_xgb_grid_model.pkl')
print("✅ Best model saved to: models/best_xgb_grid_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
# results_df[display_cols].round(2).head(15)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
🎯 Best Hyperparameters:
{'clf__colsample_bytree': 0.9, 'clf__learning_rate': 0.07, 'clf__max_depth': 7, 'clf__min_child_weight': 1, 'clf__n_estimators': 150, 'clf__reg_alpha': 0.01, 'clf__reg_lambda': 0.8, 'clf__subsample': 1.0}

✅ Best CV Accuracy: 0.7997

📊 TOP 10 CONFIGURATIONS:
  Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                                      Hyperparameters  Rank
XGBoost          0.799726         0.010206 {'colsample_bytree': 0.9, 'learning_rate': 0.07, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 150, 'reg_alpha': 0.01, 'reg_lambda': 0.8, 'subsample': 1.0}     1
XGBoost          0.799381         0.008199 {'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 250, 'reg_alpha': 0.01, 'reg_lambda': 0.8, 'subsample': 0.9}     2
XGBoost    

### STEP 2: Hyperparameter Tuning of CatBoost for Accuracy Optimization using Grid Search
This step fine-tunes the CatBoost model using GridSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to systematically identify optimal configurations for deployment while maintaining generalizability and avoiding overfitting through complete hyperparameter exploration within defined ranges.

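Hyperparameter tuning of CatBoost using grid search. Note that the cell below does not pass `cat_features`, so CatBoost's native categorical feature handling is not exercised; the selected features are used exactly as stored in the processed dataset. This version includes:

* No preprocessing step in the pipeline (using CatBoost's native categorical processing would additionally require passing `cat_features`)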
* Gradient boosting with advanced regularization techniques
* Grid search over refined CatBoost-specific hyperparameter ranges based on literature review
* Accuracy as the scoring metric
* Systematic parameter space exploration through complete enumeration within grid boundaries
* Advanced overfitting prevention through bagging temperature and random strength
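* Deterministic and reproducible results (fixed `random_state`); the best configuration is guaranteed only among the combinations listed in the grid, not over the full hyperparameter space
* Computational cost that grows multiplicatively with the number of values per parameter (192 candidates × 5 folds = 960 fits here)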

In [5]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
import joblib

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', CatBoostClassifier(verbose=0, random_state=42))
])

# Hyperparameter search space for CatBoost (fine-tuned grid based on random search findings)
param_grid = {
    'clf__iterations': [600, 800],                     # 2 values (was 4)
    'clf__depth': [3, 4, 5],                          # 3 values (was 3) ✓
    'clf__learning_rate': [0.025, 0.035],             # 2 values (was 4)
    'clf__l2_leaf_reg': [7, 11],                      # 2 values (was 3)
    'clf__border_count': [128, 254],                  # 2 values (was 3)
    'clf__bagging_temperature': [0.3, 0.7],          # 2 values (was 3)
    'clf__random_strength': [1.5, 2.5],              # 2 values (was 3)
    'clf__od_type': ['Iter']                          # 1 value (was 2)
}

# Run hyperparameter optimization using GridSearchCV
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
grid_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(grid_search.best_params_)
print(f"\n✅ Best CV Accuracy: {grid_search.best_score_:.4f}")

# Extract all grid search results for tracking
cv_results = grid_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'CatBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == grid_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[f'{param_name}'] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('results/catboost_grid_search_results.csv', index=False)
print(f"\n💾 Results saved to: results/catboost_grid_search_results.csv")

# Extract best model for reuse or export
best_model = grid_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_catboost_grid_model.pkl')
print("✅ Best model saved to: best_catboost_grid_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'iterations', 'depth', 'learning_rate', 'l2_leaf_reg', 'bagging_temperature', 'Rank']
# results_df[display_cols].round(2).head(15)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
🎯 Best Hyperparameters:
{'clf__bagging_temperature': 0.3, 'clf__border_count': 128, 'clf__depth': 4, 'clf__iterations': 800, 'clf__l2_leaf_reg': 7, 'clf__learning_rate': 0.035, 'clf__od_type': 'Iter', 'clf__random_strength': 2.5}

✅ Best CV Accuracy: 0.8016

📊 TOP 10 CONFIGURATIONS:
   Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                                        Hyperparameters  Rank
CatBoost          0.801569         0.016349  {'bagging_temperature': 0.3, 'border_count': 128, 'depth': 4, 'iterations': 800, 'l2_leaf_reg': 7, 'learning_rate': 0.035, 'od_type': 'Iter', 'random_strength': 2.5}     1
CatBoost          0.801569         0.016349  {'bagging_temperature': 0.7, 'border_count': 128, 'depth': 4, 'iterations': 800, 'l2_leaf_reg': 7, 'learning_rate': 0.035, 'od_type': 'Iter', 'random_strength': 2.5}     1
Ca

### STEP 3: Hyperparameter Tuning of Gradient Boosting for Accuracy Optimization using Grid Search
This step fine-tunes the Gradient Boosting model using GridSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to systematically identify optimal configurations for deployment while maintaining generalizability and avoiding overfitting through complete hyperparameter exploration within defined ranges.

Hyperparameter Tuning of Gradient Boosting using Grid Search, utilizing scikit-learn's robust implementation. This version includes:

* Sequential boosting with sample and feature subsampling
* Tree-based weak learners with configurable depth and split criteria
* Grid search over refined gradient boosting hyperparameter ranges based on domain expertise
* Accuracy as the scoring metric
* Systematic parameter space exploration through complete enumeration within grid boundaries
* Overfitting control through subsample ratios and leaf constraints
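* Deterministic and reproducible results (fixed `random_state`); the best configuration is guaranteed only among the combinations listed in the grid, not over the full hyperparameter space
* Computational cost that grows multiplicatively with the number of values per parameter (128 candidates × 5 folds = 640 fits here)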

In [6]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', GradientBoostingClassifier(random_state=42))
])

# Hyperparameter search space for Gradient Boosting (fine-tuned grid based on random search findings)
param_grid =  {
    'clf__n_estimators': [250, 350],                  # 2 values (was 3)
    'clf__max_depth': [3, 4],                         # 2 values (was 2) ✓
    'clf__learning_rate': [0.06, 0.08],               # 2 values (was 3)
    'clf__subsample': [0.85, 0.95],                   # 2 values (was 3)
    'clf__min_samples_split': [12, 18],               # 2 values (was 3)
    'clf__min_samples_leaf': [4, 8],                  # 2 values (was 3)
    'clf__max_features': ['log2', 0.8],               # 2 values (was 3)
    'clf__validation_fraction': [0.1]                 # 1 value (was 2)
}

# Run hyperparameter optimization using GridSearchCV
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
grid_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(grid_search.best_params_)
print(f"\n✅ Best CV Accuracy: {grid_search.best_score_:.4f}")

# Extract all grid search results for tracking
cv_results = grid_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'GradientBoosting',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == grid_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[f'{param_name}'] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('gradientboosting_grid_search_results.csv', index=False)
print(f"\n💾 Results saved to: gradientboosting_grid_search_results.csv")

# Extract best model for reuse or export
best_model = grid_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_gradientboosting_grid_model.pkl')
print("✅ Best model saved to: best_gradientboosting_grid_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'min_samples_split', 'Rank']
# results_df[display_cols].round(2).head(15)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
🎯 Best Hyperparameters:
{'clf__learning_rate': 0.08, 'clf__max_depth': 3, 'clf__max_features': 0.8, 'clf__min_samples_leaf': 4, 'clf__min_samples_split': 18, 'clf__n_estimators': 250, 'clf__subsample': 0.85, 'clf__validation_fraction': 0.1}

✅ Best CV Accuracy: 0.7994

📊 TOP 10 CONFIGURATIONS:
           Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                                                     Hyperparameters  Rank
GradientBoosting          0.799382         0.013031    {'learning_rate': 0.08, 'max_depth': 3, 'max_features': 0.8, 'min_samples_leaf': 4, 'min_samples_split': 18, 'n_estimators': 250, 'subsample': 0.85, 'validation_fraction': 0.1}     1
GradientBoosting          0.799267         0.012084    {'learning_rate': 0.06, 'max_depth': 3, 'max_features': 0.8, 'min_samples_leaf': 4, 'min_samples_split': 18, 'n

In [7]:
# ================================================================
# SUMMARY: TOP PERFORMING MODELS COMPARISON
# ================================================================
from pathlib import Path


def best_configuration(results):
    """Return the row of ``results`` with the highest ``CV_Accuracy_Mean``.

    ``idxmax`` returns the first label at which the maximum occurs, so tied
    configurations resolve to the earliest row of the results file.
    """
    return results.loc[results['CV_Accuracy_Mean'].idxmax()]


print("\n" + "="*60)
print("🏆 GRID SEARCH OPTIMIZATION SUMMARY - BEST SELECTED FEATURES")
print("="*60)

# Load results from each model
xgb_results = pd.read_csv('../results/xgboost_grid_search_results.csv')
cat_results = pd.read_csv('../results/catboost_grid_search_results.csv')
gb_results = pd.read_csv('gradientboosting_grid_search_results.csv')

# Extract best performance from each model
xgb_best = best_configuration(xgb_results)
cat_best = best_configuration(cat_results)
gb_best = best_configuration(gb_results)

# One summary record per model, built from its best-scoring row
summary_data = [
    {
        'Model': model_name,
        'CV_Accuracy_Mean': best_row['CV_Accuracy_Mean'],
        'CV_Accuracy_Std': best_row['CV_Accuracy_Std']
    }
    for model_name, best_row in [
        ('XGBoost', xgb_best),
        ('CatBoost', cat_best),
        ('GradientBoosting', gb_best),
    ]
]

# Create summary DataFrame, best model first
summary_df = pd.DataFrame(summary_data).sort_values('CV_Accuracy_Mean', ascending=False)

# Display summary
print("📊 BEST PERFORMANCE FROM EACH MODEL:")
print(summary_df.round(4).to_string(index=False))

# Save summary next to the per-model results read above ('../results'),
# creating the directory if needed, and report the path actually written
summary_path = Path('../results/grid_search_summary.csv')
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(summary_path, index=False)
print(f"\n💾 Summary saved to: {summary_path}")

# Winner announcement: first row after the descending sort
winner = summary_df.iloc[0]
print(f"\n🏆 GRID SEARCH WINNER: {winner['Model']}")
print(f"🏆 Best CV Accuracy: {winner['CV_Accuracy_Mean']:.4f} (±{winner['CV_Accuracy_Std']:.4f})")


🏆 GRID SEARCH OPTIMIZATION SUMMARY - BEST SELECTED FEATURES
📊 BEST PERFORMANCE FROM EACH MODEL:
           Model  CV_Accuracy_Mean  CV_Accuracy_Std
        CatBoost            0.8016           0.0163
         XGBoost            0.7997           0.0102
GradientBoosting            0.7994           0.0130

💾 Summary saved to: results/grid_search_summary.csv

🏆 GRID SEARCH WINNER: CatBoost
🏆 Best CV Accuracy: 0.8016 (±0.0163)
