In [3]:
from datetime import datetime
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score

In [7]:
%%time

# Load selected features from feature selection phase
best_features = pd.read_csv('../data/processed/best_features_selected.csv')['best_features'].tolist()

# Load TPE Champion model (pre-trained)
tpe_champion = joblib.load("../models/TPE_Champion_Latest_CatBoost.pkl")

# Prepare test data
X_test_kaggle = pd.read_csv('../data/processed/test_features_engineered.csv')
print(f"📊 Competition samples: {len(X_test_kaggle):,}")

passenger_ids = X_test_kaggle["PassengerId"]
X_test_kaggle = X_test_kaggle[best_features]
print(f"✅ Selected features loaded: {len(best_features)} features")

# Generate predictions with TPE Champion
print(f"🚀 Generating predictions with TPE Champion CatBoost (CV: {tpe_champion['best_score']:.4f})")
preds = tpe_champion['model'].predict(X_test_kaggle)

submission = pd.DataFrame({
    "PassengerId": passenger_ids,
    "Transported": preds.astype(bool)
})

filename = f"TPE_Champion_Submission_{tpe_champion['best_score']:.4f}.csv"
submission.to_csv(f"submissions/{filename}", index=False)
print(f"✅ Submission file created: {filename}")

📊 Competition samples: 4,277
✅ Selected features loaded: 6 features
🚀 Generating predictions with TPE Champion CatBoost (CV: 0.8033)
✅ Submission file created: TPE_Champion_Submission_0.8033.csv
CPU times: user 16.4 ms, sys: 14.8 ms, total: 31.2 ms
Wall time: 26.8 ms


In [8]:
# -----------------------------------------------------------------------------
# PREDICTION ANALYSIS
# Summarise the submission predictions: predicted class balance, followed by
# probability-based confidence figures taken from the same model.
# -----------------------------------------------------------------------------
print("\n📊 Prediction Analysis:")
print("-" * 40)
print(f"🎯 Model: TPE Champion CatBoost (CV: {tpe_champion['best_score']:.4f})")

n_predictions = len(preds)
print(f"📊 Total predictions: {n_predictions:,}")

# Boolean masks per predicted label; .sum() gives the count, .mean() the share.
not_transported = preds == 0
print(f"📊 Predicted class 0 (Not Transported): {not_transported.sum():,} ({not_transported.mean():.1%})")

transported = preds == 1
print(f"📊 Predicted class 1 (Transported): {transported.sum():,} ({transported.mean():.1%})")

# Class probabilities for the confidence figures below. Column 1 is reported
# as the prediction probability; the row-wise maximum is reported as confidence.
pred_proba = tpe_champion['model'].predict_proba(X_test_kaggle)

positive_proba = pred_proba[:, 1]
print(f"📊 Mean prediction probability: {positive_proba.mean():.3f}")

row_confidence = pred_proba.max(axis=1)
print(f"📊 Prediction confidence (max prob): {row_confidence.mean():.3f}")


📊 Prediction Analysis:
----------------------------------------
🎯 Model: TPE Champion CatBoost (CV: 0.8033)
📊 Total predictions: 4,277
📊 Predicted class 0 (Not Transported): 1,885 (44.1%)
📊 Predicted class 1 (Transported): 2,392 (55.9%)
📊 Mean prediction probability: 0.508
📊 Prediction confidence (max prob): 0.805
