In [18]:
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.impute import SimpleImputer
import joblib

In [19]:
%%time
# 📊 BLOCK: MODEL EVALUATION AND COMPETITION SUBMISSION
banner_title = "📊 Starting Model Evaluation and Competition Submission"
print(banner_title)
print("=" * 60)

# =============================================================================
# STEP 1: MODEL SELECTION
# =============================================================================
# This cell only picks which serialized model to use; the pickle itself is
# loaded later, in the prediction cell, via selected_model['file'].
print("\n🔍 Available Models:")
print("-" * 30)

# Candidate model artifacts as (pickle path, display name) pairs.
# NOTE: the paths are not checked for existence here.
model_catalog = (
    ('models/best_xgb_model.pkl', 'XGBoost (Grid Search)'),
    ('models/best_xgb_random_model.pkl', 'XGBoost (Random Search)'),
    ('models/best_xgb_tpe_model.pkl', 'XGBoost (TPE)'),
)
available_models = [
    {'file': pkl_path, 'name': display_name}
    for pkl_path, display_name in model_catalog
]

# Position of the model to use in `available_models`:
# 0 = Grid Search, 1 = Random Search, 2 = TPE.
chosen_index = 0
selected_model = available_models[chosen_index]
print("✅ Selected Model:", selected_model['name'])
print("📁 Model File:", selected_model['file'])

📊 Starting Model Evaluation and Competition Submission

🔍 Available Models:
------------------------------
✅ Selected Model: XGBoost (Grid Search)
📁 Model File: models/best_xgb_model.pkl
CPU times: user 56 μs, sys: 16 μs, total: 72 μs
Wall time: 67.2 μs


In [22]:
%%time
# =============================================================================
# STEP 2: LOAD AND PREPARE COMPETITION TEST DATA
# =============================================================================
print("\n📥 Loading Competition Test Data")
print("-" * 40)

# Read the already-processed competition test set from disk.
X_comp = pd.read_csv('../data/processed/test_dataset_spaceship_titanic_processed.csv')
n_comp_rows = len(X_comp)
print("✅ Competition test data loaded successfully")
print(f"📊 Competition samples: {n_comp_rows:,}")

# Keep the identifiers aside: they are needed to build the submission file
# but are removed from the feature matrix below.
passengerIDs = X_comp["PassengerId"]
print(f"✅ Passenger IDs extracted: {len(passengerIDs):,}")

# -----------------------------------------------------------------------------
# STEP 2b: PREPROCESS COMPETITION DATA
# -----------------------------------------------------------------------------
print("\n🔄 Preprocessing Competition Data")
print("-" * 40)

# Remove the identifier column so that only the remaining columns are passed
# to the model. From here on X_comp no longer contains 'PassengerId'.
X_comp = X_comp.drop(columns=['PassengerId'], errors='ignore')


📥 Loading Competition Test Data
----------------------------------------
✅ Competition test data loaded successfully
📊 Competition samples: 4,277
✅ Passenger IDs extracted: 4,277

🔄 Preprocessing Competition Data
----------------------------------------
CPU times: user 20.6 ms, sys: 16.7 ms, total: 37.3 ms
Wall time: 41.6 ms


In [None]:
# =============================================================================
# STEP 3: LOAD MODEL AND GENERATE PREDICTIONS
# =============================================================================
print(f"\n🔄 Loading Model and Generating Predictions")
print("-" * 40)

# Marks the start of this cell. The summary cell reads `start_time` to report
# the overall runtime, so it must stay defined here.
start_time = datetime.now()

# Load the model.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that were produced by a trusted training run.
best_model = joblib.load(selected_model['file'])
print(f"✅ Model loaded successfully")

# Time inference on its own clock so that the reported "Prediction time"
# does not include the time spent loading the model from disk.
predict_start = datetime.now()

# Generate hard labels and class probabilities on the competition test set.
pred_comp = best_model.predict(X_comp)
pred_proba_comp = best_model.predict_proba(X_comp)

prediction_time = (datetime.now() - predict_start).total_seconds()

print(f"✅ Predictions generated successfully")
print(f"📊 Competition samples: {len(X_comp):,}")
print(f"⏱️  Prediction time: {prediction_time:.3f} seconds")


In [None]:
# =============================================================================
# STEP 4: ANALYZE PREDICTIONS
# =============================================================================
print("\n📊 Prediction Analysis:")
print("-" * 40)

# Boolean masks per predicted label: .sum() gives the count of predictions
# with that label and .mean() the corresponding fraction.
mask_class_0 = pred_comp == 0
mask_class_1 = pred_comp == 1

# Column 1 of the probability matrix is reported as the "Transported"
# probability; the row-wise maximum is reported as the confidence.
proba_class_1 = pred_proba_comp[:, 1]
row_max_proba = pred_proba_comp.max(axis=1)

print(
    f"🎯 Model: {selected_model['name']}",
    f"📊 Total predictions: {len(pred_comp):,}",
    f"📊 Predicted class 0 (Not Transported): {mask_class_0.sum():,} ({mask_class_0.mean():.1%})",
    f"📊 Predicted class 1 (Transported): {mask_class_1.sum():,} ({mask_class_1.mean():.1%})",
    f"📊 Mean prediction probability: {proba_class_1.mean():.3f}",
    f"📊 Prediction confidence (max prob): {row_max_proba.mean():.3f}",
    sep="\n",
)

In [None]:
# =============================================================================
# STEP 5: CREATE SUBMISSION FILE
# =============================================================================
print("\n💾 Creating Submission File")
print("-" * 30)

# Two-column submission table: passenger identifier plus the predicted label
# cast to boolean (the format the competition expects).
transported_flags = pred_comp.astype(bool)
my_submission = pd.DataFrame({
    'PassengerId': passengerIDs,
    'Transported': transported_flags,
})

# Show the first 10 rows as a sanity check.
print("📋 Submission Preview:")
print(my_submission.head(10))

# Build a file-name-friendly slug from the model's display name:
# lower-case, spaces -> underscores, parentheses removed.
slug_table = str.maketrans({" ": "_", "(": None, ")": None})
model_slug = selected_model["name"].lower().translate(slug_table)
submission_filename = f'submission_{model_slug}.csv'

# Write the submission CSV (without the index column) to the working directory.
my_submission.to_csv(submission_filename, index=False)
print(f"✅ Submission saved to: {submission_filename}")

In [None]:
# =============================================================================
# STEP 6: SAVE DETAILED PREDICTIONS (OPTIONAL)
# =============================================================================
print("\n💾 Saving Detailed Predictions")
print("-" * 35)

# Per-passenger table: predicted label, both columns of the probability
# matrix, and the larger of the two as a confidence value.
detail_columns = {
    'PassengerId': passengerIDs,
    'Predicted_Label': pred_comp,
    'Prediction_Probability_Not_Transported': pred_proba_comp[:, 0],
    'Prediction_Probability_Transported': pred_proba_comp[:, 1],
    'Prediction_Confidence': pred_proba_comp.max(axis=1),
}
detailed_predictions = pd.DataFrame(detail_columns)

# File name derived from the model's display name:
# lower-case, spaces -> underscores, parentheses removed.
name_cleanup = str.maketrans({" ": "_", "(": None, ")": None})
detail_slug = selected_model["name"].lower().translate(name_cleanup)
detailed_filename = f'detailed_predictions_{detail_slug}.csv'

# Write the detailed CSV (without the index column) to the working directory.
detailed_predictions.to_csv(detailed_filename, index=False)
print(f"✅ Detailed predictions saved to: {detailed_filename}")

In [None]:
# =============================================================================
# STEP 7: NOTE ABOUT TEST SET
# =============================================================================
# Purely informational cell: prints a reminder that the competition data is
# unlabeled, so no metrics can be computed in this notebook.
test_set_notes = (
    "\n📝 Important Note About Test Set",
    "-" * 40,
    "ℹ️  The competition test set has NO labels (no 'Transported' column)",
    "ℹ️  This is the unlabeled data you need to predict for submission",
    "ℹ️  True performance will only be known after Kaggle submission",
    "ℹ️  Use cross-validation scores from training as performance estimates",
)
for note_line in test_set_notes:
    print(note_line)

In [None]:
# =============================================================================
# STEP 8: SUMMARY
# =============================================================================
# NOTE(review): `start_time` is set in the prediction cell, so the value shown
# as "Total Runtime" is the wall-clock time elapsed since that cell started
# (including any idle time between manual cell executions) — confirm that this
# is the intended meaning.
end_time = datetime.now()
elapsed = end_time - start_time
total_runtime = elapsed.total_seconds()

# Headline facts about the run.
print(
    "\n📈 EVALUATION SUMMARY",
    "=" * 40,
    f"✅ Model: {selected_model['name']}",
    f"📁 Model File: {selected_model['file']}",
    f"🔮 Competition Samples: {len(X_comp):,}",
    f"⏱️  Total Runtime: {total_runtime:.2f} seconds",
    sep="\n",
)

# Files written by the submission and detailed-prediction cells.
print(
    "\n📊 Files Generated:",
    f"   • {submission_filename} (Main submission file)",
    f"   • {detailed_filename} (Detailed predictions)",
    sep="\n",
)

# Suggested follow-up actions.
print(
    "\n🚀 Next Steps:",
    f"   1. Upload '{submission_filename}' to Kaggle competition",
    "   2. Check leaderboard performance",
    "   3. Compare with cross-validation scores from training",
    "   4. Consider ensemble methods if performance differs significantly",
    sep="\n",
)

# Closing banner.
print("\n✅ Model evaluation and submission preparation completed successfully!")
print("=" * 60)