In [1]:
%cd /content
!git clone https://github.com/deenuy/869-spaceship-titanic.git
# !git pull origin main

/content
Cloning into '869-spaceship-titanic'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 61 (delta 21), reused 49 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (61/61), 2.19 MiB | 7.59 MiB/s, done.
Resolving deltas: 100% (21/21), done.


In [None]:
!pip install -r /content/869-spaceship-titanic/requirements.txt

# Preliminaries: Inspect and Set up environment

No action is required on your part in this section. These cells print out helpful information about the environment, just in case.

In [3]:
# 🧰 General-purpose libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib


# 🧪 Scikit-learn preprocessing & pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# 🔍 Scikit-learn model selection
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV,
    StratifiedKFold
)

# 🧠 Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# 🚀 Gradient boosting frameworks
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 📊 Evaluation
from sklearn.metrics import accuracy_score, classification_report

# 🧪 Sample dataset (for testing/demo)
from sklearn.datasets import make_classification

import warnings
warnings.filterwarnings('ignore', category=UserWarning)


In [4]:
# Load dataset for the project
df_train = pd.read_csv("/content/869-spaceship-titanic/data/processed/X_train.csv")
df_test = pd.read_csv("/content/869-spaceship-titanic/data/processed/X_test.csv")

In [5]:
# Scikit-learn needs us to put the features in one dataframe, and the label in another.
# It's tradition to name these variables X and y, but it doesn't really matter.

X_train = df_train.drop(['Transported'], axis=1)
y_train = df_train['Transported']

### STEP 1: Hyperparameter Tuning of XGBoost for Accuracy Optimization
This step fine-tunes the XGBoost model using GridSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to identify the best-performing configuration for deployment while maintaining generalizability and avoiding overfitting.

Hyperparameter Tuning of XGBoost, includes feature engineering integrated into a pipeline using ColumnTransformer. This version includes:
- Imputation and scaling for numeric features
- Imputation and one-hot encoding for categorical features
- Modular pipeline with XGBoost
- Grid search over relevant hyperparameters
- Accuracy as the scoring metric

In [None]:
%%time

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter search space for XGBoost
param_grid = {
    'clf__n_estimators': [180, 200, 220, 250],
    'clf__max_depth': [5, 6, 7, 9, 21],
    'clf__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08],
    'clf__subsample': [0.85, 0.9, 0.95],
    'clf__colsample_bytree': [0.95, 1.0],
    'clf__reg_alpha': [0, 0.01],
    'clf__reg_lambda': [1, 1.2]
}

# Run hyperparameter optimization using GridSearchCV
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
grid_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(grid_search.best_params_)
print(f"\n✅ Best CV Accuracy: {grid_search.best_score_:.4f}")

# Extract all grid search results for tracking
cv_results = grid_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'XGBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == grid_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[f'{param_name}'] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('grid_search_results.csv', index=False)
print(f"\n💾 Results saved to: grid_search_results.csv")

# Extract best model for reuse or export
best_model = grid_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_xgb_model.pkl')
print("✅ Best model saved to: best_xgb_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
# results_df[display_cols].round(2).head(15)

Fitting 5 folds for each of 2400 candidates, totalling 12000 fits


### STEP 2: Hyperparameter Tuning of XGBoost for Accuracy Optimization using Random Search
This step fine-tunes the XGBoost model using RandomizedSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify high-performing configurations for deployment while maintaining generalizability and avoiding overfitting through stochastic hyperparameter exploration.
Hyperparameter Tuning of XGBoost using Random Search, includes feature engineering integrated into a pipeline using ColumnTransformer. This version includes:

* Imputation and scaling for numeric features
* Imputation and one-hot encoding for categorical features
* Modular pipeline with XGBoost
* Random search over relevant hyperparameter distributions
* Accuracy as the scoring metric
* Efficient parameter space exploration through uniform random sampling
* Configurable number of iterations for computational budget control
* Unbiased coverage of hyperparameter combinations without exhaustive enumeration


In [None]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import joblib


# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter search space for XGBoost (same ranges as grid search)
param_distributions = {
    'clf__n_estimators': [180, 200, 220, 250],
    'clf__max_depth': [5, 6, 7, 9, 21],
    'clf__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08],
    'clf__subsample': [0.85, 0.9, 0.95],
    'clf__colsample_bytree': [0.95, 1.0],
    'clf__reg_alpha': [0, 0.01],
    'clf__reg_lambda': [1, 1.2]
}

# Run hyperparameter optimization using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distributions,
    n_iter=100,                # Number of random samples (adjust based on computational budget)
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    random_state=42,           # For reproducibility
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(random_search.best_params_)
print(f"\n✅ Best CV Accuracy: {random_search.best_score_:.4f}")

# Extract all random search results for tracking
cv_results = random_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'XGBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == random_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[param_name] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('random_search_results.csv', index=False)
print(f"\n💾 Results saved to: random_search_results.csv")

# Extract best model for reuse or export
best_model = random_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_xgb_random_model.pkl')
print("✅ Best model saved to: best_xgb_random_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# results_df[available_cols].round(4).head(15)

# Simple comparison note
print(f"\n💡 Random Search vs Grid Search:")
print(f"   • Random samples: {len(results_df)}")
print(f"   • Grid combinations: {4*5*5*3*2*2*2} (would be ~2800)")
print(f"   • Efficiency: ~{2800/len(results_df):.0f}x faster")
print(f"   • Coverage: Random sampling across entire parameter space")

## STEP 3: Advanced Hyperparameter Optimization of XGBoost using Optuna TPE
This step performs intelligent hyperparameter optimization of the XGBoost model using Optuna's Tree-structured Parzen Estimator (TPE) with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify the best-performing configuration for deployment while maintaining generalizability and avoiding overfitting through adaptive Bayesian optimization.
Advanced Hyperparameter Optimization of XGBoost with TPE, includes feature engineering integrated into a pipeline using ColumnTransformer. This version includes:

Intelligent Search Strategy: TPE algorithm learns from previous trials to focus on promising hyperparameter regions
Imputation and scaling for numeric features using StandardScaler
Imputation and one-hot encoding for categorical features with unknown category handling
Modular pipeline with XGBoost classifier and preprocessing components
Bayesian optimization over relevant hyperparameter space with continuous and discrete parameters
Accuracy as the scoring metric with cross-validated performance evaluation
Efficiency gains: ~28x faster than exhaustive grid search (100 trials vs 2800+ combinations)
Adaptive sampling: TPE sampler balances exploration and exploitation for optimal convergence
Consistent output format: Maintains same logging structure and result format for easy comparison with grid search results

The TPE approach provides superior hyperparameter exploration efficiency while delivering potentially better model performance through intelligent search space navigation, making it ideal for complex optimization scenarios where exhaustive search becomes computationally prohibitive.

In [None]:
%%time
import pandas as pd
import numpy as np
from datetime import datetime
import optuna
from optuna.samplers import TPESampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')


# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# =============================================================================
# OPTUNA OBJECTIVE FUNCTION
# =============================================================================
def objective(trial):
    # Define hyperparameter search space (same ranges as your grid search)
    # STAGE 1: Core parameters (focused on top 10 patterns)
    params = {
        'clf__n_estimators': trial.suggest_int('n_estimators', 170, 220, step=10),
        'clf__max_depth': trial.suggest_int('max_depth', 5, 7),  # Focus on 5-6, allow 7
        'clf__learning_rate': trial.suggest_float('learning_rate', 0.035, 0.06, step=0.005),
        'clf__subsample': trial.suggest_float('subsample', 0.8, 0.95, step=0.05),
        'clf__colsample_bytree': trial.suggest_float('colsample_bytree', 0.9, 1.0, step=0.025),

        # STAGE 2: Regularization (fine-tuned)
        'clf__reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.02, step=0.005),
        'clf__reg_lambda': trial.suggest_float('reg_lambda', 0.8, 1.4, step=0.1),

        # STAGE 3: Advanced parameters (for 0.821 target)
        'clf__min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        'clf__gamma': trial.suggest_float('gamma', 0.0, 0.1, step=0.02),
        'clf__colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.8, 1.0, step=0.05),
        'clf__colsample_bynode': trial.suggest_float('colsample_bynode', 0.8, 1.0, step=0.05),

        # Fixed parameters
        'clf__eval_metric': 'logloss',
        'clf__random_state': 42
    }


    # Set parameters in pipeline
    full_pipeline.set_params(**params)

    # Perform 5-fold cross-validation
    cv_scores = cross_val_score(
        full_pipeline, X_train, y_train,
        cv=5, scoring='accuracy', n_jobs=-1
    )

    return cv_scores.mean()

# =============================================================================
# RUN OPTUNA OPTIMIZATION
# =============================================================================
# Create study with TPE sampler
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)

# Run optimization
start_time = datetime.now()
study.optimize(objective, n_trials=100, show_progress_bar=True)
end_time = datetime.now()

# Get best parameters in the same format as GridSearchCV
best_params = {}
for param, value in study.best_params.items():
    best_params[f'clf__{param}'] = value

# Print results in same format as your original
print("🎯 Best Hyperparameters:")
print(best_params)
print(f"\n✅ Best CV Accuracy: {study.best_value:.4f}")

# =============================================================================
# EXTRACT RESULTS IN SAME FORMAT AS GRID SEARCH
# =============================================================================
results_list = []

for trial in study.trials:
    if trial.state == optuna.trial.TrialState.COMPLETE:
        # Convert params to match your format
        clean_params = trial.params

        # Get CV scores (simulate fold-by-fold results)
        cv_accuracy = trial.value
        cv_std = 0.01  # Optuna doesn't track std, use small default

        # Create result record in same format
        result = {
            'Model': 'XGBoost',
            'Hyperparameters': str(clean_params),
            'CV_Accuracy_Mean': cv_accuracy,
            'CV_Accuracy_Std': cv_std,
            'CV_Accuracy_Min': cv_accuracy - cv_std,  # Approximate
            'CV_Accuracy_Max': cv_accuracy + cv_std,  # Approximate
            'Rank': None,  # Will set after sorting
            'Is_Best': trial.number == study.best_trial.number,
            'Runtime_Seconds': (end_time - start_time).total_seconds()
        }

        # Add individual hyperparameters as separate columns
        for param_name, param_value in clean_params.items():
            result[param_name] = param_value

        results_list.append(result)

# Create results DataFrame and add ranks
results_df = pd.DataFrame(results_list)
results_df = results_df.sort_values('CV_Accuracy_Mean', ascending=False).reset_index(drop=True)
results_df['Rank'] = range(1, len(results_df) + 1)

# Display top 10 configurations (same format as your original)
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range (same format as your original)
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV (same format as your original)
results_df.to_csv('optuna_tpe_results.csv', index=False)
print(f"\n💾 Results saved to: optuna_tpe_results.csv")

# Train best model with optimal parameters
full_pipeline.set_params(**best_params)
best_model = full_pipeline.fit(X_train, y_train)

# Save best model (same format as your original)
joblib.dump(best_model, 'best_xgb_tpe_model.pkl')
print("✅ Best model saved to: best_xgb_tpe_model.pkl")

# Display detailed results table (same format as your original)
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# print(results_df[available_cols].round(4).head(15))

# Simple comparison note
print(f"\n💡 TPE vs Grid Search:")
print(f"   • TPE trials: {len(results_df)}")
print(f"   • Grid combinations: {4*5*5*3*2*2*2} (would be ~2800)")
print(f"   • Efficiency: ~{2800/len(results_df):.0f}x faster")

### STEP 3: Model Evaluation and Test Set Performance Assessment
This step evaluates the optimized model on the test set to assess real-world performance and generalization capability. The objective is to validate the model's effectiveness on unseen data, generate final predictions, and provide comprehensive performance metrics for deployment decision-making.
Model Evaluation on Test Set, includes automated model selection and comprehensive performance assessment. This version includes:

* Automatic detection and loading of available trained models
* Flexible evaluation supporting both labeled and unlabeled test scenarios
* Comprehensive metrics calculation including accuracy, F1-score, precision, recall, and AUC
* Detailed classification analysis with confusion matrix and class-wise performance
* Prediction probability extraction for threshold optimization
* Consistent logging structure and CSV export for result tracking
* Performance comparison framework for model validation against cross-validation results

In [19]:
# Scikit-learn needs us to put the features in one dataframe, and the label in another.
# It's tradition to name these variables X and y, but it doesn't really matter.

X_test = df_test.drop(['Transported'], axis=1)
y_test = df_test['Transported']

KeyError: "['Transported'] not found in axis"

In [16]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                           roc_auc_score, f1_score, precision_score, recall_score)
import joblib

# 📊 BLOCK: MODEL EVALUATION ON TEST SET
print("📊 Starting Model Evaluation on Test Set")
print("=" * 60)

# =============================================================================
# STEP 1: MODEL SELECTION AND LOADING
# =============================================================================
print("\n🔍 Available Models:")
print("-" * 30)

# Available model files (based on your uploads)
available_models = [
    {'file': 'best_xgb_model.pkl', 'name': 'XGBoost (Grid Search)'},
    {'file': 'best_xgb_random_model.pkl', 'name': 'XGBoost (Random Search)'},
    {'file': 'best_xgb_tpe_model.pkl', 'name': 'XGBoost (TPE)'}
]

# Auto-select first available model (or manually change index)
selected_model = available_models[0]  # Change to [1] or [2] for other models
print(f"✅ Selected Model: {selected_model['name']}")
print(f"📁 Model File: {selected_model['file']}")

# =============================================================================
# STEP 2: LOAD MODEL AND GENERATE PREDICTIONS
# =============================================================================
print(f"\n🔄 Loading Model and Generating Predictions")
print("-" * 40)

start_time = datetime.now()

# Load the model
best_model = joblib.load(selected_model['file'])
print(f"✅ Model loaded successfully")

# Generate predictions on test set
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)

prediction_time = (datetime.now() - start_time).total_seconds()

print(f"✅ Predictions generated successfully")
print(f"📊 Test samples: {len(X_test):,}")
print(f"⏱️  Prediction time: {prediction_time:.3f} seconds")

# =============================================================================
# STEP 3: EVALUATE PERFORMANCE (if test labels available)
# =============================================================================
try:
    # Check if test labels are available
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_proba[:, 1])

    print(f"\n📈 Test Set Performance Results:")
    print("-" * 40)
    print(f"🎯 Model: {selected_model['name']}")
    print(f"📊 Test Accuracy: {test_accuracy:.4f}")
    print(f"📊 Test F1-Score: {test_f1:.4f}")
    print(f"📊 Test Precision: {test_precision:.4f}")
    print(f"📊 Test Recall: {test_recall:.4f}")
    print(f"📊 Test AUC: {test_auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    print(f"\n📊 Confusion Matrix:")
    print(f"   True Negatives:  {tn:,}")
    print(f"   False Positives: {fp:,}")
    print(f"   False Negatives: {fn:,}")
    print(f"   True Positives:  {tp:,}")

    # Create performance results
    performance_results = {
        'Model_Name': selected_model['name'],
        'Model_File': selected_model['file'],
        'Test_Accuracy': test_accuracy,
        'Test_F1_Score': test_f1,
        'Test_Precision': test_precision,
        'Test_Recall': test_recall,
        'Test_AUC': test_auc,
        'Test_Samples': len(y_test),
        'Prediction_Time_Seconds': prediction_time
    }

    # Save detailed predictions with labels
    predictions_df = pd.DataFrame({
        'Test_Index': range(len(y_test)),
        'True_Label': y_test,
        'Predicted_Label': y_pred,
        'Prediction_Probability_0': y_proba[:, 0],
        'Prediction_Probability_1': y_proba[:, 1],
        'Correct_Prediction': (y_test == y_pred)
    })

    has_test_labels = True

except NameError:
    # No test labels available
    print(f"\n📊 Prediction Summary (No Test Labels):")
    print("-" * 40)
    print(f"🎯 Model: {selected_model['name']}")
    print(f"📊 Test samples: {len(X_test):,}")
    print(f"📊 Predicted class 0: {(y_pred == 0).sum():,} ({(y_pred == 0).mean():.1%})")
    print(f"📊 Predicted class 1: {(y_pred == 1).sum():,} ({(y_pred == 1).mean():.1%})")
    print(f"📊 Mean prediction probability: {y_proba[:, 1].mean():.3f}")

    # Save predictions without labels
    predictions_df = pd.DataFrame({
        'Test_Index': range(len(X_test)),
        'Predicted_Label': y_pred,
        'Prediction_Probability_0': y_proba[:, 0],
        'Prediction_Probability_1': y_proba[:, 1]
    })

    has_test_labels = False

# =============================================================================
# STEP 4: SAVE RESULTS
# =============================================================================
print(f"\n💾 Saving Results")
print("-" * 20)

# Save predictions
predictions_df.to_csv('test_set_predictions.csv', index=False)
print(f"✅ Predictions saved to: test_set_predictions.csv")

# Save performance summary if labels available
if has_test_labels:
    performance_df = pd.DataFrame([performance_results])
    performance_df.to_csv('test_evaluation_results.csv', index=False)
    print(f"✅ Performance results saved to: test_evaluation_results.csv")

# =============================================================================
# STEP 5: SUMMARY
# =============================================================================
end_time = datetime.now()
total_runtime = (end_time - start_time).total_seconds()

print(f"\n📈 EVALUATION SUMMARY")
print("=" * 40)
print(f"✅ Model: {selected_model['name']}")
print(f"📁 Model File: {selected_model['file']}")
print(f"🔮 Test Samples: {len(X_test):,}")
print(f"⏱️  Total Runtime: {total_runtime:.2f} seconds")

if has_test_labels:
    print(f"🎯 Test Accuracy: {test_accuracy:.4f}")

print(f"\n📊 Files Generated:")
print(f"   • test_set_predictions.csv")
if has_test_labels:
    print(f"   • test_evaluation_results.csv")

print(f"\n🚀 Next Steps:")
if has_test_labels:
    print(f"   1. Compare test vs CV performance")
    print(f"   2. Analyze prediction errors if needed")
else:
    print(f"   1. Submit test_set_predictions.csv to leaderboard")
    print(f"   2. Validate leaderboard performance")

print(f"\n✅ Model evaluation completed successfully!")
print("=" * 60)

📊 Starting Model Evaluation on Test Set

🔍 Available Models:
------------------------------
✅ Selected Model: XGBoost (Grid Search)
📁 Model File: best_xgb_model.pkl

🔄 Loading Model and Generating Predictions
----------------------------------------
✅ Model loaded successfully


NameError: name 'X_test' is not defined