In [2]:
%cd /content
!git clone https://github.com/deenuy/869-spaceship-titanic.git
# !git pull origin main

/content
Cloning into '869-spaceship-titanic'...
remote: Enumerating objects: 86, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 86 (delta 29), reused 70 (delta 18), pack-reused 0 (from 0)[K
Receiving objects: 100% (86/86), 4.10 MiB | 11.73 MiB/s, done.
Resolving deltas: 100% (29/29), done.


In [3]:
!pip install -r /content/869-spaceship-titanic/requirements.txt

Collecting catboost (from -r /content/869-spaceship-titanic/requirements.txt (line 6))
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting optuna (from -r /content/869-spaceship-titanic/requirements.txt (line 7))
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting loguru==0.7.3 (from -r /content/869-spaceship-titanic/requirements.txt (line 13))
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting gradio==5.12.0 (from -r /content/869-spaceship-titanic/requirements.txt (line 16))
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio==5.12.0->-r /content/869-spaceship-titanic/requirements.txt (line 16))
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting gradio-client==1.5.4 (from gradio==5.12.0->-r /content/869-spaceship-titanic/requirements.txt (line 16))
  Downloading gradio_client-1.5.4-py3-none-any.whl.metadata (7.1 kB)
Collect

# Preliminaries: Inspect and Set up environment

No action is required on your part in this section. These cells print out helpful information about the environment, just in case.

In [4]:
# 🧰 General-purpose libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib


# 🧪 Scikit-learn preprocessing & pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# 🔍 Scikit-learn model selection
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV,
    StratifiedKFold
)

# 🧠 Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# 🚀 Gradient boosting frameworks
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 📊 Evaluation
from sklearn.metrics import accuracy_score, classification_report

# 🧪 Sample dataset (for testing/demo)
from sklearn.datasets import make_classification

import warnings
warnings.filterwarnings('ignore', category=UserWarning)


In [5]:
# Load complete processed dataset
df_processed = pd.read_csv('/content/869-spaceship-titanic/data/processed/train_dataset_spaceship_titanic_processed.csv')
X_train = df_processed.drop(['Transported', 'PassengerId'], axis=1, errors='ignore')
y_train = df_processed['Transported']

### STEP 1: Hyperparameter Tuning of XGBoost for Accuracy Optimization
This step fine-tunes the XGBoost model using GridSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to identify the best-performing configuration for deployment while maintaining generalizability and avoiding overfitting.

Hyperparameter Tuning of XGBoost, includes feature engineering integrated into a pipeline using ColumnTransformer. This version includes:
- Imputation and scaling for numeric features
- Imputation and one-hot encoding for categorical features
- Modular pipeline with XGBoost
- Grid search over relevant hyperparameters
- Accuracy as the scoring metric

In [6]:
%%time

# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter search space for XGBoost
param_grid = {
    'clf__n_estimators': [180, 200, 220, 250],
    'clf__max_depth': [5, 6, 7, 9, 21],
    'clf__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08],
    'clf__subsample': [0.85, 0.9, 0.95],
    'clf__colsample_bytree': [0.95, 1.0],
    'clf__reg_alpha': [0, 0.01],
    'clf__reg_lambda': [1, 1.2]
}

# Run hyperparameter optimization using GridSearchCV
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
grid_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(grid_search.best_params_)
print(f"\n✅ Best CV Accuracy: {grid_search.best_score_:.4f}")

# Extract all grid search results for tracking
cv_results = grid_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'XGBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == grid_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[f'{param_name}'] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('grid_search_results.csv', index=False)
print(f"\n💾 Results saved to: grid_search_results.csv")

# Extract best model for reuse or export
best_model = grid_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_xgb_model.pkl')
print("✅ Best model saved to: best_xgb_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
# results_df[display_cols].round(2).head(15)

Fitting 5 folds for each of 2400 candidates, totalling 12000 fits
🎯 Best Hyperparameters:
{'clf__colsample_bytree': 0.95, 'clf__learning_rate': 0.04, 'clf__max_depth': 5, 'clf__n_estimators': 180, 'clf__reg_alpha': 0, 'clf__reg_lambda': 1.2, 'clf__subsample': 0.85}

✅ Best CV Accuracy: 0.9624

📊 TOP 10 CONFIGURATIONS:
  Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                 Hyperparameters  Rank
XGBoost          0.962383         0.007715    {'colsample_bytree': 0.95, 'learning_rate': 0.04, 'max_depth': 5, 'n_estimators': 180, 'reg_alpha': 0, 'reg_lambda': 1.2, 'subsample': 0.85}     1
XGBoost          0.962383         0.006649  {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 21, 'n_estimators': 180, 'reg_alpha': 0.01, 'reg_lambda': 1.2, 'subsample': 0.9}     2
XGBoost          0.962153         0.007965    {'colsample_bytree': 0.95, 'learning_rate': 0.04, 'max_

###STEP 2: Hyperparameter Tuning of XGBoost for Accuracy Optimization using Random Search
This step fine-tunes the XGBoost model using RandomizedSearchCV with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify high-performing configurations for deployment while maintaining generalizability and avoiding overfitting through stochastic hyperparameter exploration.
Hyperparameter Tuning of XGBoost using Random Search, includes feature engineering integrated into a pipeline using ColumnTransformer. This version includes:

* Imputation and scaling for numeric features
* Imputation and one-hot encoding for categorical features
* Modular pipeline with XGBoost
* Random search over relevant hyperparameter distributions
* Accuracy as the scoring metric
* Efficient parameter space exploration through uniform random sampling
* Configurable number of iterations for computational budget control
* Unbiased coverage of hyperparameter combinations without exhaustive enumeration


In [7]:
%%time
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import joblib


# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# Hyperparameter search space for XGBoost (same ranges as grid search)
param_distributions = {
    'clf__n_estimators': [180, 200, 220, 250],
    'clf__max_depth': [5, 6, 7, 9, 21],
    'clf__learning_rate': [0.04, 0.05, 0.06, 0.07, 0.08],
    'clf__subsample': [0.85, 0.9, 0.95],
    'clf__colsample_bytree': [0.95, 1.0],
    'clf__reg_alpha': [0, 0.01],
    'clf__reg_lambda': [1, 1.2]
}

# Run hyperparameter optimization using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=full_pipeline,
    param_distributions=param_distributions,
    n_iter=100,                # Number of random samples (adjust based on computational budget)
    scoring='accuracy',        # Main leaderboard metric
    cv=5,                      # 5-fold cross-validation
    n_jobs=-1,                 # Parallel execution
    verbose=1,                 # Print progress
    random_state=42,           # For reproducibility
    return_train_score=True    # Track training performance
)

# Fit the pipeline to training data
start_time = datetime.now()
random_search.fit(X_train, y_train)
end_time = datetime.now()

# Print best result and CV performance
print("🎯 Best Hyperparameters:")
print(random_search.best_params_)
print(f"\n✅ Best CV Accuracy: {random_search.best_score_:.4f}")

# Extract all random search results for tracking
cv_results = random_search.cv_results_
results_list = []

for i in range(len(cv_results['params'])):
    # Extract hyperparameters
    params = cv_results['params'][i]
    clean_params = {k.replace('clf__', ''): v for k, v in params.items()}

    # Get CV scores for each fold
    cv_scores = []
    for fold in range(5):
        cv_scores.append(cv_results[f'split{fold}_test_score'][i])

    # Create result record
    result = {
        'Model': 'XGBoost',
        'Hyperparameters': str(clean_params),
        'CV_Accuracy_Mean': cv_results['mean_test_score'][i],
        'CV_Accuracy_Std': cv_results['std_test_score'][i],
        'CV_Accuracy_Min': min(cv_scores),
        'CV_Accuracy_Max': max(cv_scores),
        'Rank': cv_results['rank_test_score'][i],
        'Is_Best': i == random_search.best_index_,
        'Runtime_Seconds': (end_time - start_time).total_seconds()
    }

    # Add individual hyperparameters as separate columns
    for param_name, param_value in clean_params.items():
        result[param_name] = param_value

    results_list.append(result)

# Create results DataFrame
results_df = pd.DataFrame(results_list)

# Display top 10 configurations
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV
results_df.to_csv('random_search_results.csv', index=False)
print(f"\n💾 Results saved to: random_search_results.csv")

# Extract best model for reuse or export
best_model = random_search.best_estimator_

# Save best model
joblib.dump(best_model, 'best_xgb_random_model.pkl')
print("✅ Best model saved to: best_xgb_random_model.pkl")

# Display detailed results table
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# results_df[available_cols].round(4).head(15)

# Simple comparison note
print(f"\n💡 Random Search vs Grid Search:")
print(f"   • Random samples: {len(results_df)}")
print(f"   • Grid combinations: {4*5*5*3*2*2*2} (would be ~2800)")
print(f"   • Efficiency: ~{2800/len(results_df):.0f}x faster")
print(f"   • Coverage: Random sampling across entire parameter space")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
🎯 Best Hyperparameters:
{'clf__subsample': 0.9, 'clf__reg_lambda': 1, 'clf__reg_alpha': 0, 'clf__n_estimators': 200, 'clf__max_depth': 21, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 0.95}

✅ Best CV Accuracy: 0.9616

📊 TOP 10 CONFIGURATIONS:
  Model  CV_Accuracy_Mean  CV_Accuracy_Std                                                                                                                                  Hyperparameters  Rank
XGBoost          0.961578         0.006149       {'subsample': 0.9, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 21, 'learning_rate': 0.05, 'colsample_bytree': 0.95}     1
XGBoost          0.961463         0.008570      {'subsample': 0.9, 'reg_lambda': 1.2, 'reg_alpha': 0, 'n_estimators': 180, 'max_depth': 7, 'learning_rate': 0.04, 'colsample_bytree': 0.95}     2
XGBoost          0.961233         0.005182 {'subsample': 0.95, 'reg_lambda': 1.2, 'reg_alpha': 0.01, 'n

## STEP 3: Advanced Hyperparameter Optimization of XGBoost using Optuna TPE
This step performs intelligent hyperparameter optimization of the XGBoost model using Optuna's Tree-structured Parzen Estimator (TPE) with 5-fold cross-validation, targeting improved accuracy aligned with leaderboard evaluation. The objective is to efficiently identify the best-performing configuration for deployment while maintaining generalizability and avoiding overfitting through adaptive Bayesian optimization.
Advanced Hyperparameter Optimization of XGBoost with TPE, includes feature engineering integrated into a pipeline using ColumnTransformer. This version includes:

Intelligent Search Strategy: TPE algorithm learns from previous trials to focus on promising hyperparameter regions
Imputation and scaling for numeric features using StandardScaler
Imputation and one-hot encoding for categorical features with unknown category handling
Modular pipeline with XGBoost classifier and preprocessing components
Bayesian optimization over relevant hyperparameter space with continuous and discrete parameters
Accuracy as the scoring metric with cross-validated performance evaluation
Efficiency gains: ~28x faster than exhaustive grid search (100 trials vs 2800+ combinations)
Adaptive sampling: TPE sampler balances exploration and exploitation for optimal convergence
Consistent output format: Maintains same logging structure and result format for easy comparison with grid search results

The TPE approach provides superior hyperparameter exploration efficiency while delivering potentially better model performance through intelligent search space navigation, making it ideal for complex optimization scenarios where exhaustive search becomes computationally prohibitive.

In [8]:
%%time
import pandas as pd
import numpy as np
from datetime import datetime
import optuna
from optuna.samplers import TPESampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
import joblib
import warnings
warnings.filterwarnings('ignore')


# Pipeline with preprocessing + model
full_pipeline = Pipeline([
    # ('preprocessing', preprocessor),
    ('clf', XGBClassifier(eval_metric='logloss', random_state=42))
])

# =============================================================================
# OPTUNA OBJECTIVE FUNCTION
# =============================================================================
def objective(trial):
    # Define hyperparameter search space (same ranges as your grid search)
    # STAGE 1: Core parameters (focused on top 10 patterns)
    params = {
        'clf__n_estimators': trial.suggest_int('n_estimators', 170, 220, step=10),
        'clf__max_depth': trial.suggest_int('max_depth', 5, 7),  # Focus on 5-6, allow 7
        'clf__learning_rate': trial.suggest_float('learning_rate', 0.035, 0.06, step=0.005),
        'clf__subsample': trial.suggest_float('subsample', 0.8, 0.95, step=0.05),
        'clf__colsample_bytree': trial.suggest_float('colsample_bytree', 0.9, 1.0, step=0.025),

        # STAGE 2: Regularization (fine-tuned)
        'clf__reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.02, step=0.005),
        'clf__reg_lambda': trial.suggest_float('reg_lambda', 0.8, 1.4, step=0.1),

        # STAGE 3: Advanced parameters (for 0.821 target)
        'clf__min_child_weight': trial.suggest_int('min_child_weight', 1, 3),
        'clf__gamma': trial.suggest_float('gamma', 0.0, 0.1, step=0.02),
        'clf__colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.8, 1.0, step=0.05),
        'clf__colsample_bynode': trial.suggest_float('colsample_bynode', 0.8, 1.0, step=0.05),

        # Fixed parameters
        'clf__eval_metric': 'logloss',
        'clf__random_state': 42
    }


    # Set parameters in pipeline
    full_pipeline.set_params(**params)

    # Perform 5-fold cross-validation
    cv_scores = cross_val_score(
        full_pipeline, X_train, y_train,
        cv=5, scoring='accuracy', n_jobs=-1
    )

    return cv_scores.mean()

# =============================================================================
# RUN OPTUNA OPTIMIZATION
# =============================================================================
# Create study with TPE sampler
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42)
)

# Run optimization
start_time = datetime.now()
study.optimize(objective, n_trials=100, show_progress_bar=True)
end_time = datetime.now()

# Get best parameters in the same format as GridSearchCV
best_params = {}
for param, value in study.best_params.items():
    best_params[f'clf__{param}'] = value

# Print results in same format as your original
print("🎯 Best Hyperparameters:")
print(best_params)
print(f"\n✅ Best CV Accuracy: {study.best_value:.4f}")

# =============================================================================
# EXTRACT RESULTS IN SAME FORMAT AS GRID SEARCH
# =============================================================================
results_list = []

for trial in study.trials:
    if trial.state == optuna.trial.TrialState.COMPLETE:
        # Convert params to match your format
        clean_params = trial.params

        # Get CV scores (simulate fold-by-fold results)
        cv_accuracy = trial.value
        cv_std = 0.01  # Optuna doesn't track std, use small default

        # Create result record in same format
        result = {
            'Model': 'XGBoost',
            'Hyperparameters': str(clean_params),
            'CV_Accuracy_Mean': cv_accuracy,
            'CV_Accuracy_Std': cv_std,
            'CV_Accuracy_Min': cv_accuracy - cv_std,  # Approximate
            'CV_Accuracy_Max': cv_accuracy + cv_std,  # Approximate
            'Rank': None,  # Will set after sorting
            'Is_Best': trial.number == study.best_trial.number,
            'Runtime_Seconds': (end_time - start_time).total_seconds()
        }

        # Add individual hyperparameters as separate columns
        for param_name, param_value in clean_params.items():
            result[param_name] = param_value

        results_list.append(result)

# Create results DataFrame and add ranks
results_df = pd.DataFrame(results_list)
results_df = results_df.sort_values('CV_Accuracy_Mean', ascending=False).reset_index(drop=True)
results_df['Rank'] = range(1, len(results_df) + 1)

# Display top 10 configurations (same format as your original)
print("\n📊 TOP 10 CONFIGURATIONS:")
top_configs = results_df.nlargest(10, 'CV_Accuracy_Mean')[
    ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Hyperparameters', 'Rank']
]
print(top_configs.to_string(index=False))

# Show accuracy range (same format as your original)
print(f"\n📈 PERFORMANCE SUMMARY:")
print(f"Total experiments: {len(results_df)}")
print(f"Best CV Accuracy: {results_df['CV_Accuracy_Mean'].max():.4f}")
print(f"Worst CV Accuracy: {results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Accuracy Range: {results_df['CV_Accuracy_Mean'].max() - results_df['CV_Accuracy_Mean'].min():.4f}")
print(f"Total Runtime: {(end_time - start_time).total_seconds():.2f} seconds")

# Save results to CSV (same format as your original)
results_df.to_csv('optuna_tpe_results.csv', index=False)
print(f"\n💾 Results saved to: optuna_tpe_results.csv")

# Train best model with optimal parameters
full_pipeline.set_params(**best_params)
best_model = full_pipeline.fit(X_train, y_train)

# Save best model (same format as your original)
joblib.dump(best_model, 'best_xgb_tpe_model.pkl')
print("✅ Best model saved to: best_xgb_tpe_model.pkl")

# Display detailed results table (same format as your original)
print(f"\n📋 DETAILED RESULTS TABLE:")
display_cols = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'Rank']
available_cols = [col for col in display_cols if col in results_df.columns]
# print(results_df[available_cols].round(4).head(15))

# Simple comparison note
print(f"\n💡 TPE vs Grid Search:")
print(f"   • TPE trials: {len(results_df)}")
print(f"   • Grid combinations: {4*5*5*3*2*2*2} (would be ~2800)")
print(f"   • Efficiency: ~{2800/len(results_df):.0f}x faster")

[I 2025-06-04 07:37:06,876] A new study created in memory with name: no-name-388464cd-47fa-4665-8045-f7dd1c398bd1


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-06-04 07:37:08,228] Trial 0 finished with value: 0.9626128662756726 and parameters: {'n_estimators': 190, 'max_depth': 7, 'learning_rate': 0.05500000000000001, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_alpha': 0.0, 'reg_lambda': 0.8, 'min_child_weight': 3, 'gamma': 0.06, 'colsample_bylevel': 0.9500000000000001, 'colsample_bynode': 0.8}. Best is trial 0 with value: 0.9626128662756726.
[I 2025-06-04 07:37:09,827] Trial 1 finished with value: 0.9614625153273147 and parameters: {'n_estimators': 220, 'max_depth': 7, 'learning_rate': 0.04, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_alpha': 0.005, 'reg_lambda': 1.1, 'min_child_weight': 2, 'gamma': 0.02, 'colsample_bylevel': 0.9500000000000001, 'colsample_bynode': 0.8}. Best is trial 0 with value: 0.9626128662756726.
[I 2025-06-04 07:37:11,020] Trial 2 finished with value: 0.9606579181585915 and parameters: {'n_estimators': 180, 'max_depth': 6, 'learning_rate': 0.045000000000000005, 'subsample': 0.95, 'colsample_bytree': 0.9

### STEP 3: Model Evaluation and Test Set Performance Assessment
This step evaluates the optimized model on the test set to assess real-world performance and generalization capability. The objective is to validate the model's effectiveness on unseen data, generate final predictions, and provide comprehensive performance metrics for deployment decision-making.
Model Evaluation on Test Set, includes automated model selection and comprehensive performance assessment. This version includes:

* Automatic detection and loading of available trained models
* Flexible evaluation supporting both labeled and unlabeled test scenarios
* Comprehensive metrics calculation including accuracy, F1-score, precision, recall, and AUC
* Detailed classification analysis with confusion matrix and class-wise performance
* Prediction probability extraction for threshold optimization
* Consistent logging structure and CSV export for result tracking
* Performance comparison framework for model validation against cross-validation results

In [9]:
%%time
# 📊 BLOCK: MODEL EVALUATION AND COMPETITION SUBMISSION
print("📊 Starting Model Evaluation and Competition Submission")
print("=" * 60)

# =============================================================================
# STEP 1: MODEL SELECTION AND LOADING
# =============================================================================
print("\n🔍 Available Models:")
print("-" * 30)

# Available model files (based on your uploads)
available_models = [
    {'file': 'best_xgb_model.pkl', 'name': 'XGBoost (Grid Search)'},
    {'file': 'best_xgb_random_model.pkl', 'name': 'XGBoost (Random Search)'},
    {'file': 'best_xgb_tpe_model.pkl', 'name': 'XGBoost (TPE)'}
]

# Auto-select first available model (or manually change index)
selected_model = available_models[0]  # Change to [1] or [2] for other models
print(f"✅ Selected Model: {selected_model['name']}")
print(f"📁 Model File: {selected_model['file']}")

📊 Starting Model Evaluation and Competition Submission

🔍 Available Models:
------------------------------
✅ Selected Model: XGBoost (Grid Search)
📁 Model File: best_xgb_model.pkl
CPU times: user 140 µs, sys: 0 ns, total: 140 µs
Wall time: 138 µs


In [10]:
%%time
# =============================================================================
# STEP 2: LOAD COMPETITION TEST DATA
# =============================================================================
print(f"\n📥 Loading Competition Test Data")
print("-" * 40)

# Load competition test data
X_comp = pd.read_csv('/content/869-spaceship-titanic/data/processed/test_dataset_spaceship_titanic_processed.csv')
print(f"✅ Competition test data loaded successfully")
print(f"📊 Competition samples: {len(X_comp):,}")

# Save PassengerIds for submission
passengerIDs = X_comp["PassengerId"]
print(f"✅ Passenger IDs extracted: {len(passengerIDs):,}")

# =============================================================================
# STEP 3: PREPROCESS COMPETITION DATA
# =============================================================================
print(f"\n🔄 Preprocessing Competition Data")
print("-" * 40)

# Drop PassengerId column
X_comp = X_comp.drop(['PassengerId'], axis=1, errors='ignore')


📥 Loading Competition Test Data
----------------------------------------
✅ Competition test data loaded successfully
📊 Competition samples: 4,277
✅ Passenger IDs extracted: 4,277

🔄 Preprocessing Competition Data
----------------------------------------
CPU times: user 56.2 ms, sys: 0 ns, total: 56.2 ms
Wall time: 56 ms


In [11]:
# =============================================================================
# STEP 4: LOAD MODEL AND GENERATE PREDICTIONS
# =============================================================================
print(f"\n🔄 Loading Model and Generating Predictions")
print("-" * 40)

start_time = datetime.now()

# Load the model
best_model = joblib.load(selected_model['file'])
print(f"✅ Model loaded successfully")

# Generate predictions on competition test set
pred_comp = best_model.predict(X_comp)
pred_proba_comp = best_model.predict_proba(X_comp)

prediction_time = (datetime.now() - start_time).total_seconds()

print(f"✅ Predictions generated successfully")
print(f"📊 Competition samples: {len(X_comp):,}")
print(f"⏱️  Prediction time: {prediction_time:.3f} seconds")



🔄 Loading Model and Generating Predictions
----------------------------------------
✅ Model loaded successfully
✅ Predictions generated successfully
📊 Competition samples: 4,277
⏱️  Prediction time: 0.100 seconds


In [12]:
# =============================================================================
# STEP 4: ANALYZE PREDICTIONS
# =============================================================================
print(f"\n📊 Prediction Analysis:")
print("-" * 40)
print(f"🎯 Model: {selected_model['name']}")
print(f"📊 Total predictions: {len(pred_comp):,}")
print(f"📊 Predicted class 0 (Not Transported): {(pred_comp == 0).sum():,} ({(pred_comp == 0).mean():.1%})")
print(f"📊 Predicted class 1 (Transported): {(pred_comp == 1).sum():,} ({(pred_comp == 1).mean():.1%})")
print(f"📊 Mean prediction probability: {pred_proba_comp[:, 1].mean():.3f}")
print(f"📊 Prediction confidence (max prob): {pred_proba_comp.max(axis=1).mean():.3f}")


📊 Prediction Analysis:
----------------------------------------
🎯 Model: XGBoost (Grid Search)
📊 Total predictions: 4,277
📊 Predicted class 0 (Not Transported): 2,007 (46.9%)
📊 Predicted class 1 (Transported): 2,270 (53.1%)
📊 Mean prediction probability: 0.502
📊 Prediction confidence (max prob): 0.657


In [13]:
# =============================================================================
# STEP 5: CREATE SUBMISSION FILE
# =============================================================================
print(f"\n💾 Creating Submission File")
print("-" * 30)

# Create submission dataframe
my_submission = pd.DataFrame({
    'PassengerId': passengerIDs,
    'Transported': pred_comp.astype(bool)  # Convert to boolean as required by competition
})

# Display first 10 rows as sanity check
print(f"📋 Submission Preview:")
print(my_submission.head(10))

# Save submission file
submission_filename = f'submission_{selected_model["name"].lower().replace(" ", "_").replace("(", "").replace(")", "")}.csv'
my_submission.to_csv(submission_filename, index=False)
print(f"✅ Submission saved to: {submission_filename}")


💾 Creating Submission File
------------------------------
📋 Submission Preview:
   PassengerId  Transported
0            0         True
1            1         True
2            2         True
3            3         True
4            4         True
5            5         True
6            6         True
7            7         True
8            8         True
9            9         True
✅ Submission saved to: submission_xgboost_grid_search.csv


In [14]:
# =============================================================================
# STEP 6: SAVE DETAILED PREDICTIONS (OPTIONAL)
# =============================================================================
print(f"\n💾 Saving Detailed Predictions")
print("-" * 35)

# Create detailed predictions dataframe
detailed_predictions = pd.DataFrame({
    'PassengerId': passengerIDs,
    'Predicted_Label': pred_comp,
    'Prediction_Probability_Not_Transported': pred_proba_comp[:, 0],
    'Prediction_Probability_Transported': pred_proba_comp[:, 1],
    'Prediction_Confidence': pred_proba_comp.max(axis=1)
})

detailed_filename = f'detailed_predictions_{selected_model["name"].lower().replace(" ", "_").replace("(", "").replace(")", "")}.csv'
detailed_predictions.to_csv(detailed_filename, index=False)
print(f"✅ Detailed predictions saved to: {detailed_filename}")


💾 Saving Detailed Predictions
-----------------------------------
✅ Detailed predictions saved to: detailed_predictions_xgboost_grid_search.csv


In [15]:
# =============================================================================
# STEP 7: NOTE ABOUT TEST SET
# =============================================================================
print(f"\n📝 Important Note About Test Set")
print("-" * 40)
print("ℹ️  The competition test set has NO labels (no 'Transported' column)")
print("ℹ️  This is the unlabeled data you need to predict for submission")
print("ℹ️  True performance will only be known after Kaggle submission")
print("ℹ️  Use cross-validation scores from training as performance estimates")


📝 Important Note About Test Set
----------------------------------------
ℹ️  The competition test set has NO labels (no 'Transported' column)
ℹ️  This is the unlabeled data you need to predict for submission
ℹ️  True performance will only be known after Kaggle submission
ℹ️  Use cross-validation scores from training as performance estimates


In [16]:
# =============================================================================
# STEP 8: SUMMARY
# =============================================================================
end_time = datetime.now()
total_runtime = (end_time - start_time).total_seconds()

print(f"\n📈 EVALUATION SUMMARY")
print("=" * 40)
print(f"✅ Model: {selected_model['name']}")
print(f"📁 Model File: {selected_model['file']}")
print(f"🔮 Competition Samples: {len(X_comp):,}")
print(f"⏱️  Total Runtime: {total_runtime:.2f} seconds")

print(f"\n📊 Files Generated:")
print(f"   • {submission_filename} (Main submission file)")
print(f"   • {detailed_filename} (Detailed predictions)")

print(f"\n🚀 Next Steps:")
print(f"   1. Upload '{submission_filename}' to Kaggle competition")
print(f"   2. Check leaderboard performance")
print(f"   3. Compare with cross-validation scores from training")
print(f"   4. Consider ensemble methods if performance differs significantly")

print(f"\n✅ Model evaluation and submission preparation completed successfully!")
print("=" * 60)


📈 EVALUATION SUMMARY
✅ Model: XGBoost (Grid Search)
📁 Model File: best_xgb_model.pkl
🔮 Competition Samples: 4,277
⏱️  Total Runtime: 0.15 seconds

📊 Files Generated:
   • submission_xgboost_grid_search.csv (Main submission file)
   • detailed_predictions_xgboost_grid_search.csv (Detailed predictions)

🚀 Next Steps:
   1. Upload 'submission_xgboost_grid_search.csv' to Kaggle competition
   2. Check leaderboard performance
   3. Compare with cross-validation scores from training
   4. Consider ensemble methods if performance differs significantly

✅ Model evaluation and submission preparation completed successfully!
