In [18]:
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.impute import SimpleImputer
import joblib

In [19]:
%%time
# BLOCK: MODEL EVALUATION AND COMPETITION SUBMISSION
# This cell only picks which saved model the later cells will load; nothing is
# read from disk here.
print("üìä Starting Model Evaluation and Competition Submission")
print(60 * "=")

# =============================================================================
# STEP 1: MODEL SELECTION AND LOADING
# =============================================================================
print("\nüîç Available Models:")
print(30 * "-")

# Candidate model artifacts. Each entry pairs the pickle path ('file') with a
# human-readable label ('name'); the label is also reused by later cells to
# build output file names.
available_models = [
    dict(file='models/best_xgb_model.pkl', name='XGBoost (Grid Search)'),
    dict(file='models/best_xgb_random_model.pkl', name='XGBoost (Random Search)'),
    dict(file='models/best_xgb_tpe_model.pkl', name='XGBoost (TPE)'),
]

# The first entry is used by default. Switch the index to 1 or 2 to evaluate
# one of the other candidates. No check is made that the file exists.
selected_model = available_models[0]
print(
    f"‚úÖ Selected Model: {selected_model['name']}",
    f"üìÅ Model File: {selected_model['file']}",
    sep="\n",
)

üìä Starting Model Evaluation and Competition Submission

üîç Available Models:
------------------------------
‚úÖ Selected Model: XGBoost (Grid Search)
üìÅ Model File: models/best_xgb_model.pkl
CPU times: user 56 Œºs, sys: 16 Œºs, total: 72 Œºs
Wall time: 67.2 Œºs


In [22]:
%%time
# =============================================================================
# STEP 2: LOAD COMPETITION TEST DATA
# =============================================================================
# Reads the processed competition CSV into X_comp and sets the PassengerId
# column aside in passengerIDs for the submission file.
print("\nüì• Loading Competition Test Data")
print(40 * "-")

comp_csv_path = '../data/processed/test_dataset_spaceship_titanic_processed.csv'
X_comp = pd.read_csv(comp_csv_path)
print("‚úÖ Competition test data loaded successfully")
print(f"üìä Competition samples: {X_comp.shape[0]:,}")

# Keep the identifiers before the column is removed from the feature frame.
passengerIDs = X_comp["PassengerId"]
print(f"‚úÖ Passenger IDs extracted: {passengerIDs.size:,}")

# =============================================================================
# STEP 2 (continued): PREPROCESS COMPETITION DATA
# =============================================================================
print("\nüîÑ Preprocessing Competition Data")
print(40 * "-")

# PassengerId is an identifier, not a feature: remove it so X_comp holds only
# the remaining columns of the CSV.
X_comp = X_comp.drop(columns=['PassengerId'], errors='ignore')


üì• Loading Competition Test Data
----------------------------------------
‚úÖ Competition test data loaded successfully
üìä Competition samples: 4,277
‚úÖ Passenger IDs extracted: 4,277

üîÑ Preprocessing Competition Data
----------------------------------------
CPU times: user 20.6 ms, sys: 16.7 ms, total: 37.3 ms
Wall time: 41.6 ms


In [None]:
# =============================================================================
# STEP 3: LOAD MODEL AND GENERATE PREDICTIONS
# =============================================================================
# Loads the model chosen in `selected_model` and scores the competition
# features in `X_comp`.
# Defines: start_time, best_model, pred_comp, pred_proba_comp, prediction_time.
print(f"\nüîÑ Loading Model and Generating Predictions")
print("-" * 40)

# start_time marks the beginning of the whole load-and-predict stage. It is
# kept here (before the load) because the summary cell measures the total
# runtime from it.
start_time = datetime.now()

# Load the model.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load model files that come from a trusted source.
best_model = joblib.load(selected_model['file'])
print(f"‚úÖ Model loaded successfully")

# Time inference on its own clock so the reported "Prediction time" covers
# only predict/predict_proba and is not inflated by model deserialization.
predict_start = datetime.now()

# Generate hard labels and class probabilities for the competition test set.
pred_comp = best_model.predict(X_comp)
pred_proba_comp = best_model.predict_proba(X_comp)

prediction_time = (datetime.now() - predict_start).total_seconds()

print(f"‚úÖ Predictions generated successfully")
print(f"üìä Competition samples: {len(X_comp):,}")
print(f"‚è±Ô∏è  Prediction time: {prediction_time:.3f} seconds")


In [None]:
# =============================================================================
# STEP 4: ANALYZE PREDICTIONS
# =============================================================================
# Prints a short summary of the predicted labels (pred_comp) and of the class
# probabilities (pred_proba_comp) produced by the prediction cell.
print("\nüìä Prediction Analysis:")
print(40 * "-")
print(f"üéØ Model: {selected_model['name']}")
print(f"üìä Total predictions: {len(pred_comp):,}")

# Count and share of each predicted class, reported in label order 0 then 1.
for class_label, class_name in ((0, "Not Transported"), (1, "Transported")):
    class_mask = pred_comp == class_label
    print(f"üìä Predicted class {class_label} ({class_name}): {class_mask.sum():,} ({class_mask.mean():.1%})")

# Column 1 of the probability matrix is treated as the "Transported" class;
# the row-wise maximum is the probability of whichever class was most likely.
transported_proba = pred_proba_comp[:, 1]
top_class_proba = pred_proba_comp.max(axis=1)
print(f"üìä Mean prediction probability: {transported_proba.mean():.3f}")
print(f"üìä Prediction confidence (max prob): {top_class_proba.mean():.3f}")

In [None]:
# =============================================================================
# STEP 5: CREATE SUBMISSION FILE
# =============================================================================
# Builds the two-column submission frame (PassengerId, Transported) and writes
# it to a CSV whose name is derived from the selected model's label.
print("\nüíæ Creating Submission File")
print(30 * "-")

# Predicted labels are cast to booleans, as required by the competition.
transported_flags = pred_comp.astype(bool)
my_submission = pd.DataFrame({
    'PassengerId': passengerIDs,
    'Transported': transported_flags,
})

# Show the first 10 rows as a sanity check.
print("üìã Submission Preview:")
print(my_submission.head(10))

# File-name slug from the model label: lower-case, spaces become underscores,
# parentheses are removed.
model_slug = selected_model["name"].lower().translate(str.maketrans(" ", "_", "()"))
submission_filename = f'submission_{model_slug}.csv'

# Written without the index so the file contains only the two columns above.
my_submission.to_csv(submission_filename, index=False)
print(f"‚úÖ Submission saved to: {submission_filename}")

In [None]:
# =============================================================================
# STEP 6: SAVE DETAILED PREDICTIONS (OPTIONAL)
# =============================================================================
# Writes one row per passenger with the predicted label, both class
# probabilities and the probability of the most likely class.
print("\nüíæ Saving Detailed Predictions")
print(35 * "-")

# Column order below is the column order of the CSV.
detail_columns = {
    'PassengerId': passengerIDs,
    'Predicted_Label': pred_comp,
    'Prediction_Probability_Not_Transported': pred_proba_comp[:, 0],
    'Prediction_Probability_Transported': pred_proba_comp[:, 1],
    'Prediction_Confidence': pred_proba_comp.max(axis=1),
}
detailed_predictions = pd.DataFrame(detail_columns)

# File-name slug from the model label: lower-case, spaces become underscores,
# parentheses are removed.
model_slug = selected_model["name"].lower().translate(str.maketrans(" ", "_", "()"))
detailed_filename = f'detailed_predictions_{model_slug}.csv'

detailed_predictions.to_csv(detailed_filename, index=False)
print(f"‚úÖ Detailed predictions saved to: {detailed_filename}")

In [None]:
# =============================================================================
# STEP 7: NOTE ABOUT TEST SET
# =============================================================================
# Informational output only: reminds the reader that the competition data is
# unlabeled, so no metrics can be computed in this notebook.
print("\nüìù Important Note About Test Set")
print(40 * "-")

test_set_notes = (
    "‚ÑπÔ∏è  The competition test set has NO labels (no 'Transported' column)",
    "‚ÑπÔ∏è  This is the unlabeled data you need to predict for submission",
    "‚ÑπÔ∏è  True performance will only be known after Kaggle submission",
    "‚ÑπÔ∏è  Use cross-validation scores from training as performance estimates",
)
for note_line in test_set_notes:
    print(note_line)

In [None]:
# =============================================================================
# STEP 8: SUMMARY
# =============================================================================
# Recaps the run: chosen model, sample count, elapsed time, generated files
# and suggested follow-ups.
# The elapsed time is measured from `start_time`, which is set in the
# model-loading cell, so it covers everything executed since that point
# rather than the whole notebook.
end_time = datetime.now()
total_runtime = (end_time - start_time).total_seconds()

print("\nüìà EVALUATION SUMMARY")
print(40 * "=")
print(f"‚úÖ Model: {selected_model['name']}")
print(f"üìÅ Model File: {selected_model['file']}")
print(f"üîÆ Competition Samples: {len(X_comp):,}")
print(f"‚è±Ô∏è  Total Runtime: {total_runtime:.2f} seconds")

# Files written by the two previous cells.
print("\nüìä Files Generated:")
generated_files = (
    (submission_filename, "Main submission file"),
    (detailed_filename, "Detailed predictions"),
)
for out_file, file_role in generated_files:
    print(f"   ‚Ä¢ {out_file} ({file_role})")

# Numbered follow-up actions, printed in order starting at 1.
print("\nüöÄ Next Steps:")
next_steps = (
    f"Upload '{submission_filename}' to Kaggle competition",
    "Check leaderboard performance",
    "Compare with cross-validation scores from training",
    "Consider ensemble methods if performance differs significantly",
)
for step_no, step_text in enumerate(next_steps, start=1):
    print(f"   {step_no}. {step_text}")

print("\n‚úÖ Model evaluation and submission preparation completed successfully!")
print(60 * "=")