In [None]:
# Environment setup: change the working directory to /content and clone the
# project repository there.
%cd /content
!git clone https://github.com/deenuy/869-spaceship-titanic.git
# Left disabled: alternative command for updating a clone that already exists.
# !git pull origin main

In [None]:
!pip install -r /content/869-spaceship-titanic/requirements.txt

# Preliminaries: Inspect and Set up environment

No action is required on your part in this section. These cells print out helpful information about the environment, just in case.

In [2]:
# 🧰 General-purpose libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib


# 🧪 Scikit-learn preprocessing & pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# 🔍 Scikit-learn model selection
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV,
    StratifiedKFold
)

# 🧠 Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# 🚀 Gradient boosting frameworks
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 📊 Evaluation
from sklearn.metrics import accuracy_score, classification_report

# 🧪 Sample dataset (for testing/demo)
from sklearn.datasets import make_classification

import warnings
warnings.filterwarnings('ignore', category=UserWarning)


In [3]:
# ----------------------------------------------------------------
# Training data: read the engineered dataset and keep only the
# columns listed in the feature-selection output file
# ----------------------------------------------------------------

print("📥 LOADING TRAINING DATA WITH SELECTED FEATURES")
print("=" * 55)

# Full engineered training set
df_processed = pd.read_csv('../data/processed/train_features_engineered.csv')
print(f"   ✅ Training dataset loaded: {df_processed.shape}")

# Feature names stored in the 'best_features' column of the selection file
best_features = pd.read_csv('../data/processed/best_features_selected.csv')['best_features'].tolist()
print(f"   ✅ Selected features loaded: {len(best_features)} features")

# Model inputs are restricted to the selected columns; the target is 'Transported'
X_train = df_processed[best_features]
y_train = df_processed['Transported']

# Two columns are not counted as features (PassengerId and Transported,
# per the original note on this calculation)
n_original_features = df_processed.shape[1] - 2
n_selected_features = X_train.shape[1]
feature_reduction_pct = (n_original_features - n_selected_features) / n_original_features * 100

print("\n📊 FILTERED TRAINING DATA SUMMARY:")
print(f"   📊 Original features: {n_original_features}")
print(f"   📊 Selected features: {n_selected_features}")
print(f"   📊 Feature reduction: {feature_reduction_pct:.1f}%")
print(f"   📊 Training samples: {X_train.shape[0]}")
print(f"   📊 Target distribution: {y_train.sum()}/{len(y_train)} transported")

print("\n🎯 SELECTED FEATURES FOR OPTIMIZATION:")
for position, feature_name in enumerate(best_features, start=1):
    print(f"   {position:2d}. {feature_name}")

print("\n✅ Training data ready for hyperparameter optimization!")

📥 LOADING TRAINING DATA WITH SELECTED FEATURES
   ✅ Training dataset loaded: (8693, 19)
   ✅ Selected features loaded: 6 features

📊 FILTERED TRAINING DATA SUMMARY:
   📊 Original features: 17
   📊 Selected features: 6
   📊 Feature reduction: 64.7%
   📊 Training samples: 8693
   📊 Target distribution: 4378/8693 transported

🎯 SELECTED FEATURES FOR OPTIMIZATION:
    1. HomePlanet
    2. CryoSleep
    3. RoomService
    4. TotalSpend
    5. LuxurySpend
    6. Cabin_HomePlanet

✅ Training data ready for hyperparameter optimization!


### STEP 1: Hyperparameter Tuning of XGBoost for Accuracy Optimization using Random Search
This step fine-tunes the XGBoost model using RandomizedSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify high-performing configurations for deployment while maintaining generalizability and avoiding overfitting through stochastic hyperparameter exploration.
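This cell tunes XGBoost with random search. The design calls for feature preprocessing integrated into the pipeline via a ColumnTransformer; note that in the code below the `preprocessing` step is currently commented out, so the model is fit directly on the selected features as loaded above. This version includes: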

* Imputation and scaling for numeric features
* Imputation and one-hot encoding for categorical features
* Modular pipeline with XGBoost
* Random search over relevant hyperparameter distributions
* Accuracy as the scoring metric
* Efficient parameter space exploration through uniform random sampling
* Configurable number of iterations for computational budget control
* Unbiased coverage of hyperparameter combinations without exhaustive enumeration


In [4]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import joblib


# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter search space for XGBoost (same ranges as grid search)
param_distributions = {
    'clf__n_estimators': [180, 200, 220, 250],
    'clf__max_depth': [5, 6, 7, 9, 21],
    'clf__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08],
    'clf__subsample': [0.85, 0.9, 0.95],
    'clf__colsample_bytree': [0.95, 1.0],
    'clf__reg_alpha': [0, 0.01],
    'clf__reg_lambda': [1, 1.2]
}

# Run hyperparameter optimization using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distributions,
    n_iter=100,                # Number of random samples (adjust based on computational budget)
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    random_state=42,           # For reproducibility
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(random_search.best_params_)
print(f"\n✅ Best CV Accuracy: {random_search.best_score_:.4f}")

# Extract all random search results for tracking
cv_results = random_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'XGBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == random_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[param_name] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('results/xgboost_random_search_results.csv', index=False)
print(f"\n💾 Results saved to: results/xgboost_random_search_results.csv")

# Extract best model for reuse or export
best_model = random_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_xgb_random_model.pkl')
print("✅ Best model saved to: best_xgb_random_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# results_df[available_cols].round(4).head(15)

# Simple comparison note
print(f"\n💡 Random Search vs Grid Search:")
print(f"   • Random samples: {len(results_df)}")
print(f"   • Grid combinations: {4*5*5*3*2*2*2} (would be ~2800)")
print(f"   • Efficiency: ~{2800/len(results_df):.0f}x faster")
print(f"   • Coverage: Random sampling across entire parameter space")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
🎯 Best Hyperparameters:
{'clf__subsample': 0.95, 'clf__reg_lambda': 1, 'clf__reg_alpha': 0.01, 'clf__n_estimators': 180, 'clf__max_depth': 6, 'clf__learning_rate': 0.04, 'clf__colsample_bytree': 0.95}

✅ Best CV Accuracy: 0.7985

📊 TOP 10 CONFIGURATIONS:
  Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                Hyperparameters  Rank
XGBoost          0.798461         0.011299  {'subsample': 0.95, 'reg_lambda': 1, 'reg_alpha': 0.01, 'n_estimators': 180, 'max_depth': 6, 'learning_rate': 0.04, 'colsample_bytree': 0.95}     1
XGBoost          0.798461         0.008987 {'subsample': 0.85, 'reg_lambda': 1.2, 'reg_alpha': 0.01, 'n_estimators': 220, 'max_depth': 6, 'learning_rate': 0.05, 'colsample_bytree': 1.0}     2
XGBoost          0.798347         0.011286     {'subsample': 0.9, 'reg_lambda': 1.2, 'reg_alpha': 0, 'n_es

### STEP 2: Hyperparameter Tuning of CatBoost for Accuracy Optimization using Random Search
This step fine-tunes the CatBoost model using RandomizedSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify high-performing configurations for deployment while maintaining generalizability and avoiding overfitting through stochastic hyperparameter exploration.

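This cell tunes CatBoost with random search. CatBoost can handle categorical features natively; note, however, that the code below does not pass `cat_features`, so the selected columns are used as loaded, without any categorical-specific treatment. This version includes: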

* Native categorical feature processing without preprocessing
* Gradient boosting with advanced regularization techniques
* Random search over CatBoost-specific hyperparameter distributions
* Accuracy as the scoring metric
* Efficient parameter space exploration through uniform random sampling
* Advanced overfitting prevention through bagging temperature and random strength
* Configurable number of iterations for computational budget control
* Unbiased coverage of hyperparameter combinations without exhaustive enumeration

In [5]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
import joblib

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', CatBoostClassifier(verbose=0, random_state=42))
])

# Hyperparameter search space for CatBoost (optimized for Spaceship Titanic)
param_distributions = {
    'clf__iterations': [500, 750, 1000, 1250, 1500],
    'clf__depth': [4, 5, 6, 7, 8],
    'clf__learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'clf__l2_leaf_reg': [1, 3, 5, 7, 9],
    'clf__border_count': [128, 254],
    'clf__bagging_temperature': [0, 0.5, 1],
    'clf__random_strength': [0, 1, 2]
}

# Run hyperparameter optimization using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distributions,
    n_iter=100,                # Number of random samples (adjust based on computational budget)
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    random_state=42,           # For reproducibility
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(random_search.best_params_)
print(f"\n✅ Best CV Accuracy: {random_search.best_score_:.4f}")

# Extract all random search results for tracking
cv_results = random_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'CatBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == random_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[param_name] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('results/catboost_random_search_results.csv', index=False)
print(f"\n💾 Results saved to: results/catboost_random_search_results.csv")

# Extract best model for reuse or export
best_model = random_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_catboost_random_model.pkl')
print("✅ Best model saved to: best_catboost_random_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'iterations', 'depth', 'learning_rate', 'l2_leaf_reg', 'bagging_temperature', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# results_df[available_cols].round(4).head(15)

# Simple comparison note
print(f"\n💡 Random Search vs Grid Search:")
print(f"   • Random samples: {len(results_df)}")
print(f"   • Grid combinations: {5*5*5*5*2*3*3} (would be ~5625)")
print(f"   • Efficiency: ~{5625/len(results_df):.0f}x faster")
print(f"   • Coverage: Random sampling across entire parameter space")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
🎯 Best Hyperparameters:
{'clf__random_strength': 2, 'clf__learning_rate': 0.03, 'clf__l2_leaf_reg': 9, 'clf__iterations': 750, 'clf__depth': 4, 'clf__border_count': 128, 'clf__bagging_temperature': 0.5}

✅ Best CV Accuracy: 0.8009

📊 TOP 10 CONFIGURATIONS:
   Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                 Hyperparameters  Rank
CatBoost          0.800878         0.013055 {'random_strength': 2, 'learning_rate': 0.03, 'l2_leaf_reg': 9, 'iterations': 750, 'depth': 4, 'border_count': 128, 'bagging_temperature': 0.5}     1
CatBoost          0.800877         0.012536   {'random_strength': 0, 'learning_rate': 0.03, 'l2_leaf_reg': 7, 'iterations': 500, 'depth': 6, 'border_count': 254, 'bagging_temperature': 1}     2
CatBoost          0.800876         0.008733 {'random_strength': 0, 'learning_rate': 0.03, 'l2_leaf

### STEP 3: Hyperparameter Tuning of Gradient Boosting for Accuracy Optimization using Random Search
This step fine-tunes the Gradient Boosting model using RandomizedSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify high-performing configurations for deployment while maintaining generalizability and avoiding overfitting through stochastic hyperparameter exploration.

Hyperparameter Tuning of Gradient Boosting using Random Search, utilizing scikit-learn's robust implementation. This version includes:

* Sequential boosting with sample and feature subsampling
* Tree-based weak learners with configurable depth and split criteria
* Random search over gradient boosting hyperparameter distributions
* Accuracy as the scoring metric
* Efficient parameter space exploration through uniform random sampling
* Overfitting control through subsample ratios and leaf constraints
* Configurable number of iterations for computational budget control
* Unbiased coverage of hyperparameter combinations without exhaustive enumeration

In [6]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', GradientBoostingClassifier(random_state=42))
])

# Hyperparameter search space for Gradient Boosting (optimized for Spaceship Titanic)
param_distributions = {
    'clf__n_estimators': [150, 200, 250, 300, 350],
    'clf__max_depth': [3, 4, 5, 6, 7],
    'clf__learning_rate': [0.05, 0.07, 0.1, 0.12, 0.15],
    'clf__subsample': [0.8, 0.85, 0.9, 0.95, 1.0],
    'clf__min_samples_split': [2, 5, 10, 15],
    'clf__min_samples_leaf': [1, 2, 4, 6],
    'clf__max_features': ['sqrt', 'log2', 0.8, 1.0]
}

# Run hyperparameter optimization using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distributions,
    n_iter=100,                # Number of random samples (adjust based on computational budget)
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    random_state=42,           # For reproducibility
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(random_search.best_params_)
print(f"\n✅ Best CV Accuracy: {random_search.best_score_:.4f}")

# Extract all random search results for tracking
cv_results = random_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'GradientBoosting',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == random_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[param_name] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('results/gradientboosting_random_search_results.csv', index=False)
print(f"\n💾 Results saved to: results/gradientboosting_random_search_results.csv")

# Extract best model for reuse or export
best_model = random_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_gradientboosting_random_model.pkl')
print("✅ Best model saved to: best_gradientboosting_random_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'min_samples_split', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# results_df[available_cols].round(4).head(15)

# Simple comparison note
print(f"\n💡 Random Search vs Grid Search:")
print(f"   • Random samples: {len(results_df)}")
print(f"   • Grid combinations: {5*5*5*5*4*4*4} (would be ~8000)")
print(f"   • Efficiency: ~{8000/len(results_df):.0f}x faster")
print(f"   • Coverage: Random sampling across entire parameter space")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
🎯 Best Hyperparameters:
{'clf__subsample': 0.9, 'clf__n_estimators': 300, 'clf__min_samples_split': 15, 'clf__min_samples_leaf': 6, 'clf__max_features': 'log2', 'clf__max_depth': 3, 'clf__learning_rate': 0.07}

✅ Best CV Accuracy: 0.7998

📊 TOP 10 CONFIGURATIONS:
           Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                         Hyperparameters  Rank
GradientBoosting          0.799842         0.012974  {'subsample': 0.9, 'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 'log2', 'max_depth': 3, 'learning_rate': 0.07}     1
GradientBoosting          0.799842         0.012974  {'subsample': 0.9, 'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.07}     1
GradientBoosting          0.799037         

In [9]:
# ================================================================
# SUMMARY: TOP PERFORMING MODELS COMPARISON
# ================================================================
# Reads the per-model random search result files, keeps the best
# configuration of each model (highest mean CV accuracy), ranks the
# models against each other and writes the comparison next to the
# input files.

print("\n" + "="*60)
print("🏆 RANDOM SEARCH OPTIMIZATION SUMMARY - BEST SELECTED FEATURES")
print("="*60)

# Load results from each model
xgb_results = pd.read_csv('../results/xgboost_random_search_results.csv')
cat_results = pd.read_csv('../results/catboost_random_search_results.csv')
gb_results = pd.read_csv('../results/gradientboosting_random_search_results.csv')

# Best row (highest mean CV accuracy) of each model
xgb_best = xgb_results.loc[xgb_results['CV_Accuracy_Mean'].idxmax()]
cat_best = cat_results.loc[cat_results['CV_Accuracy_Mean'].idxmax()]
gb_best = gb_results.loc[gb_results['CV_Accuracy_Mean'].idxmax()]

# One summary record per model, built the same way for all three
summary_data = [
    {
        'Model': model_name,
        'CV_Accuracy_Mean': best_row['CV_Accuracy_Mean'],
        'CV_Accuracy_Std': best_row['CV_Accuracy_Std']
    }
    for model_name, best_row in (
        ('XGBoost', xgb_best),
        ('CatBoost', cat_best),
        ('GradientBoosting', gb_best),
    )
]

# Create summary DataFrame, best model first
summary_df = pd.DataFrame(summary_data).sort_values('CV_Accuracy_Mean', ascending=False)

# Display summary
print("📊 BEST PERFORMANCE FROM EACH MODEL:")
print(summary_df.round(4).to_string(index=False))

# Save summary into the same directory the inputs were read from; the
# confirmation message is derived from the path so the two cannot disagree
summary_path = '../results/random_search_summary.csv'
summary_df.to_csv(summary_path, index=False)
print(f"\n💾 Summary saved to: {summary_path}")

# Winner announcement
winner = summary_df.iloc[0]
print(f"\n🏆 RANDOM SEARCH WINNER: {winner['Model']}")
print(f"🏆 Best CV Accuracy: {winner['CV_Accuracy_Mean']:.4f} (±{winner['CV_Accuracy_Std']:.4f})")


🏆 RANDOM SEARCH OPTIMIZATION SUMMARY - BEST SELECTED FEATURES
📊 BEST PERFORMANCE FROM EACH MODEL:
           Model  CV_Accuracy_Mean  CV_Accuracy_Std
        CatBoost            0.8009           0.0131
GradientBoosting            0.7998           0.0130
         XGBoost            0.7985           0.0113

💾 Summary saved to: random_search_summary.csv

🏆 RANDOM SEARCH WINNER: CatBoost
🏆 Best CV Accuracy: 0.8009 (±0.0131)
