<a href="https://colab.research.google.com/github/stepthom/869_course/blob/main/2026%20869%20Project%20Template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MMAI 869 Project: Example Notebook

*Updated May 1, 2025*

This notebook serves as a template for the Team Project. Teams can use this notebook as a starting point, and update it successively with new ideas and techniques to improve their model results.

Note that it is not required to use this template. Teams may also alter this template in any way they see fit.

# Preliminaries: Inspect and Set up environment

No action is required on your part in this section. These cells print out helpful information about the environment, just in case.

In [8]:
# 🧰 General-purpose libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib


# 🧪 Scikit-learn preprocessing & pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# 🔍 Scikit-learn model selection
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_validate,
    GridSearchCV,
    StratifiedKFold
)

# 🧠 Scikit-learn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# 🚀 Gradient boosting frameworks
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# 📊 Evaluation
from sklearn.metrics import accuracy_score, classification_report

# 🧪 Sample dataset (for testing/demo)
from sklearn.datasets import make_classification

import warnings
# Suppress warnings whose category is UserWarning (or a subclass of it) so
# they do not clutter the cell outputs below.
warnings.filterwarnings('ignore', category=UserWarning)


In [9]:
# Print the version reported by the `python` executable found on the shell PATH.
!python --version

Python 3.10.16


# 0: Data Loading and Inspection

In [None]:
# Read the fully processed training table from disk.
df_processed = pd.read_csv('../data/processed/train_dataset_spaceship_titanic_processed.csv')

# Everything except the target and the passenger identifier is a model input.
# `errors='ignore'` keeps the drop from failing when one of these columns is
# absent from the file.
non_feature_columns = ['Transported', 'PassengerId']
X_train = df_processed.drop(columns=non_feature_columns, errors='ignore')

# The prediction target.
y_train = df_processed['Transported']

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts, memory usage).
df_processed.info()

In [12]:
# Preview the first 10 rows of the processed training data.
df_processed.head(10)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,SoloTraveler,TotalSpend,LuxurySpend,BasicSpend,Cabin_HomePlanet,Cabin_Destination,Cabin_CryoSleep,PassengerId,Transported
0,1,0,3,39.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,2,7,3,0001_01,False
1,0,0,3,24.0,0,109.0,9.0,25.0,549.0,44.0,1,736.0,593.0,118.0,13,23,15,0002_01,True
2,1,0,3,58.0,2,43.0,3576.0,0.0,6715.0,49.0,0,10383.0,6764.0,3619.0,0,3,0,0003_01,False
3,1,0,3,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,5176.0,3522.0,1283.0,0,3,0,0003_02,False
4,0,0,3,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1091.0,567.0,373.0,13,23,15,0004_01,True
5,0,0,2,44.0,0,0.0,483.0,0.0,291.0,0.0,1,774.0,291.0,483.0,13,22,15,0005_01,True
6,0,0,3,26.0,0,42.0,1539.0,3.0,0.0,0.0,1,1584.0,0.0,1581.0,13,23,15,0006_01,True
7,0,2,3,28.0,0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,16,27,20,0006_02,True
8,0,0,3,35.0,0,0.0,785.0,17.0,216.0,0.0,1,1018.0,216.0,785.0,13,23,15,0007_01,True
9,1,2,0,14.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,2,4,5,0008_01,True


## 1.2: Model creation, hyperparameter tuning, and validation

### STEP 1: Baseline Model Experimentation
Evaluate both tree-based and non-tree ML models using 5-fold cross-validation to compare macro F1, weighted F1, and accuracy scores. No additional feature engineering is applied in this step: the models are trained directly on the processed dataset loaded above, and the results serve as a baseline for leaderboard benchmarking.

In [16]:
%%time

# Define short descriptions
model_descriptions = {
    "Decision Tree": "A simple, interpretable tree that splits data based on feature thresholds.",
    "Random Forest": "An ensemble of decision trees that improves generalization via bagging.",
    "Gradient Boosting": "A sequential ensemble where each tree corrects errors from the last.",
    "AdaBoost": "A boosting method that emphasizes misclassified examples during training.",
    "Logistic Regression": "A linear model that predicts probabilities for classification tasks.",
    "SVM": "A margin-based classifier that finds the optimal boundary between classes using support vectors.",
    "XGBoost": "A scalable, regularized boosting method with tree-based learners.",
    "LightGBM": "A fast, efficient gradient boosting method based on histogram-based learning.",
    "CatBoost": "A gradient boosting library with native support for categorical features."
}

# Define model types
model_types = {
    "Decision Tree": "Tree-Based",
    "Random Forest": "Tree-Based",
    "Gradient Boosting": "Tree-Based",
    "AdaBoost": "Tree-Based",
    "XGBoost": "Tree-Based",
    "CatBoost": "Tree-Based",
    "Logistic Regression": "Non-Tree",
    "SVM": "Non-Tree"
}

# Define models
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=3, random_state=0),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=0),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=0),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=0),
    "Logistic Regression": LogisticRegression(max_iter=5000, random_state=0),
    "SVM": SVC(kernel="rbf", C=1.0, probability=True, random_state=0),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=0),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=0)
}

separator = "-" * 60
results = {}

print("\n📊 Starting model evaluation with 5-fold cross-validation\n")

for name, model in models.items():
    print(separator)
    print(f"🔍 Model: {name}")
    print(f"🧠 Description: {model_descriptions.get(name, 'N/A')}")
    print(f"⚙️  Params: {model.get_params()}")

    cv_result = cross_validate(
        model, X_train, y_train,
        cv=5,
        scoring=["f1_macro", "f1_weighted", "accuracy"],
        return_train_score=True,
        n_jobs=-1
    )

    # Calculate means and standard deviations
    results[name] = {
        "Model Type": model_types[name],
        "Train F1 (Macro)": np.mean(cv_result["train_f1_macro"]),
        "Train F1 Std": np.std(cv_result["train_f1_macro"]),
        "CV F1 (Macro)": np.mean(cv_result["test_f1_macro"]),
        "CV F1 Std": np.std(cv_result["test_f1_macro"]),
        "CV F1 (Weighted)": np.mean(cv_result["test_f1_weighted"]),
        "CV F1 (W) Std": np.std(cv_result["test_f1_weighted"]),
        "CV Accuracy": np.mean(cv_result["test_accuracy"]),
        "CV Accuracy Std": np.std(cv_result["test_accuracy"])
    }

    # Print results with standard deviations
    print(f"✅ Train F1 (Macro): {results[name]['Train F1 (Macro)']:.4f} (±{results[name]['Train F1 Std']:.4f})")
    print(f"✅ CV F1 (Macro): {results[name]['CV F1 (Macro)']:.4f} (±{results[name]['CV F1 Std']:.4f})")
    print(f"✅ CV F1 (Weighted): {results[name]['CV F1 (Weighted)']:.4f} (±{results[name]['CV F1 (W) Std']:.4f})")
    print(f"✅ CV Accuracy: {results[name]['CV Accuracy']:.4f} (±{results[name]['CV Accuracy Std']:.4f})")

# Final summary
print("\n✅ All models evaluated. Summary below:\n")

summary_df = pd.DataFrame(results).T.sort_values("CV Accuracy", ascending=False)
display_columns = ["Model Type", "CV Accuracy", "CV Accuracy Std", "CV F1 (Macro)", "CV F1 (Weighted)"]
display(summary_df[display_columns].round(4))



📊 Starting model evaluation with 5-fold cross-validation

------------------------------------------------------------
🔍 Model: Decision Tree
🧠 Description: A simple, interpretable tree that splits data based on feature thresholds.
⚙️  Params: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 0, 'splitter': 'best'}
✅ Train F1 (Macro): 0.7679 (±0.0014)
✅ CV F1 (Macro): 0.7574 (±0.0149)
✅ CV F1 (Weighted): 0.7575 (±0.0148)
✅ CV Accuracy: 0.7594 (±0.0139)
------------------------------------------------------------
🔍 Model: Random Forest
🧠 Description: An ensemble of decision trees that improves generalization via bagging.
⚙️  Params: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_no

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Train F1 (Macro): 0.7853 (±0.0015)
✅ CV F1 (Macro): 0.7854 (±0.0052)
✅ CV F1 (Weighted): 0.7854 (±0.0052)
✅ CV Accuracy: 0.7857 (±0.0053)
------------------------------------------------------------
🔍 Model: SVM
🧠 Description: A margin-based classifier that finds the optimal boundary between classes using support vectors.
⚙️  Params: {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': 0, 'shrinking': True, 'tol': 0.001, 'verbose': False}
✅ Train F1 (Macro): 0.7881 (±0.0030)
✅ CV F1 (Macro): 0.7847 (±0.0142)
✅ CV F1 (Weighted): 0.7849 (±0.0142)
✅ CV Accuracy: 0.7868 (±0.0134)
------------------------------------------------------------
🔍 Model: XGBoost
🧠 Description: A scalable, regularized boosting method with tree-based learners.
⚙️  Params: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'call

Unnamed: 0,Model Type,CV Accuracy,CV Accuracy Std,CV F1 (Macro),CV F1 (Weighted)
CatBoost,Tree-Based,0.805363,0.007201,0.805001,0.805042
Gradient Boosting,Tree-Based,0.79869,0.006831,0.798131,0.798193
XGBoost,Tree-Based,0.796045,0.009502,0.795722,0.79576
Random Forest,Tree-Based,0.791098,0.008009,0.790922,0.790907
SVM,Non-Tree,0.786843,0.01335,0.784734,0.784884
Logistic Regression,Non-Tree,0.78569,0.005255,0.785377,0.785425
AdaBoost,Tree-Based,0.781321,0.011132,0.780722,0.780793
Decision Tree,Tree-Based,0.75935,0.013936,0.757415,0.757542


CPU times: user 70.7 ms, sys: 70.9 ms, total: 142 ms
Wall time: 9.79 s


### Summary of Baseline Model Results
After engineering 17 high-impact features from the raw spaceship data, we evaluated eight machine learning algorithms using a structured performance framework. From basic decision trees to advanced ensemble models like CatBoost, each model was tested through 5-fold cross-validation.
CatBoost led the pack, achieving 80.54% accuracy with remarkable consistency—just 0.7% standard deviation. Ensemble tree-based models dominated overall, with Gradient Boosting and XGBoost close behind at 79.9% and 79.6%, respectively.

In [20]:
# Fit the strongest baseline models on the full training set and write one
# Kaggle submission file (PassengerId, Transported) per model.

# Models to test
# XGBoost is configured exactly as in the baseline comparison; the deprecated
# `use_label_encoder` flag is not passed.
submission_models = {
    "CatBoost": CatBoostClassifier(verbose=0, random_state=0),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100, random_state=0),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=0)
}

# Prepare test data
kaggle_test_df = pd.read_csv('../data/processed/test_dataset_spaceship_titanic_processed.csv')
passenger_ids = kaggle_test_df["PassengerId"]
# Select exactly the training feature columns, in training order. This raises
# a KeyError if the test file lacks a feature and drops any column the models
# were not trained on, instead of relying on the two files happening to match.
X_test_kaggle = kaggle_test_df[X_train.columns]

# Fit on a 0/1 integer target so every library returns numeric predictions.
# NOTE(review): if a library handed boolean labels back as strings,
# `astype(bool)` would not reliably map "False" to False - confirm per library.
y_train_binary = y_train.astype(int)

# Generate and save predictions for each model
for name, model in submission_models.items():
    print(f"\n🚀 Training and predicting with: {name}")
    model.fit(X_train, y_train_binary)
    # ravel() flattens a possible (n, 1) prediction array to one dimension.
    preds = np.asarray(model.predict(X_test_kaggle)).ravel()
    assert len(preds) == len(passenger_ids), "prediction count does not match the number of test passengers"

    submission = pd.DataFrame({
        "PassengerId": passenger_ids,
        "Transported": preds.astype(bool)
    })

    submission_path = f"submission_{name.lower()}.csv"
    submission.to_csv(submission_path, index=False)
    print(f"✅ Submission file created: {submission_path}")


🚀 Training and predicting with: CatBoost
✅ Submission file created: submission_catboost.csv

🚀 Training and predicting with: GradientBoosting
✅ Submission file created: submission_gradientboosting.csv

🚀 Training and predicting with: XGBoost
✅ Submission file created: submission_xgboost.csv
