# LAB | Ensemble Methods

**Load the data**

In this challenge, we will be working with the same Spaceship Titanic data, like the previous Lab. The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

In this Lab, you should try different ensemble methods in order to see if can obtain a better model than before. In order to do a fair comparison, you should perform the same feature scaling, engineering applied in previous Lab.

In [1]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# Sample DataFrame creation
data = {
    'PassengerId': ['0001_01', '0002_01', '0003_01', '0003_02', '0004_01'],
    'HomePlanet': ['Europa', 'Earth', 'Europa', 'Europa', 'Earth'],
    'CryoSleep': [False, False, False, False, False],
    'Cabin': ['B/0/P', 'F/0/S', 'A/0/S', 'A/0/S', 'F/1/S'],
    'Destination': ['TRAPPIST-1e'] * 5,
    'Age': [39.0, 24.0, 58.0, 33.0, 16.0],
    'VIP': [False, False, True, False, False],
    'RoomService': [0.0, 109.0, 43.0, 0.0, 303.0],
    'FoodCourt': [0.0, 9.0, 3576.0, 1283.0, 70.0],
    'ShoppingMall': [0.0, 25.0, 0.0, 371.0, 151.0],
    'Spa': [0.0, 549.0, 6715.0, 3329.0, 565.0],
    'VRDeck': [0.0, 44.0, 49.0, 193.0, 2.0],
    'Transported': [False, True, False, False, True]
}
    
df = pd.DataFrame(data) 


df['CryoSleep'] = df['CryoSleep'].astype(int)
df['VIP'] = df['VIP'].astype(int)
df['Transported'] = df['Transported'].astype(int)

# One-hot encode categorical columns
df = pd.get_dummies(df, columns=['HomePlanet', 'Destination', 'Cabin'], drop_first=True)

# Drop identifiers
df = df.drop(columns=['PassengerId'])

# Separate features and target
X = df.drop(columns=['Transported'])
y = df['Transported']



In [11]:
# Feature scaling 

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X_scaled, y)

# Get selected feature names
selected_features = X.columns[selector.get_support()]
print("Top Selected Features:\n", selected_features)

Top Selected Features:
 Index(['Age', 'RoomService', 'HomePlanet_Europa', 'Cabin_F/0/S',
       'Cabin_F/1/S'],
      dtype='object')


  f = msb / msw


**Perform Train Test Split**

In [15]:
 X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=test_size, random_state=42, stratify=y
    )

    return X_train, X_test, y_train, y_test, selected_features

X_train, X_test, y_train, y_test, selected_features = preprocess_and_split(df)

print("Selected Features:", selected_features.tolist())
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

IndentationError: unexpected indent (1328291543.py, line 1)

**Model Selection** - now you will try to apply different ensemble methods in order to get a better model

- Bagging and Pasting

In [19]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


def train_bagging_pasting(X_train, X_test, y_train, y_test):
    # Base estimator
    base_tree = DecisionTreeClassifier(random_state=42)

    # Bagging classifier (with replacement)
    bagging_clf = BaggingClassifier(
        estimator=base_tree,
        n_estimators=10,
        max_samples=0.8,
        bootstrap=True,  # Sampling WITH replacement
        random_state=42
    )
    
    bagging_clf.fit(X_train, y_train)
    y_pred_bagging = bagging_clf.predict(X_test)
    acc_bagging = accuracy_score(y_test, y_pred_bagging)
    
pasting_clf = BaggingClassifier(
        estimator=base_tree,
        n_estimators=10,
        max_samples=0.8,
        bootstrap=False,  # Sampling WITHOUT replacement
        random_state=42
    )
    pasting_clf.fit(X_train, y_train)
    y_pred_pasting = pasting_clf.predict(X_test)
    acc_pasting = accuracy_score(y_test, y_pred_pasting)

    print(f"Bagging Accuracy: {acc_bagging:.2f}")
    print(f"Pasting Accuracy: {acc_pasting:.2f}")

    return bagging_clf, pasting_clf

bagging_model, pasting_model = train_bagging_pasting(X_train, X_test, y_train, y_test)

IndentationError: unexpected indent (1731248463.py, line 30)

- Random Forests

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def train_random_forest(X_train, X_test, y_train, y_test):
    # Initialize Random Forest
    rf_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        random_state=42,
        n_jobs=-1  # Use all CPU cores
    )

    # Fit the model
    rf_clf.fit(X_train, y_train)

    # Predict
    y_pred = rf_clf.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Random Forest Accuracy: {accuracy:.2f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    return rf_clf

    

In [21]:
rf_model = train_random_forest(X_train, X_test, y_train, y_test)

NameError: name 'X_train' is not defined

- Gradient Boosting

In [32]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

def train_boosting_models(X_train, X_test, y_train, y_test):
    results = {}

    # Gradient Boosting
    gb_clf = GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, random_state=42
    )
    gb_clf.fit(X_train, y_train)
    gb_pred = gb_clf.predict(X_test)
    gb_acc = accuracy_score(y_test, gb_pred)
    results['Gradient Boosting'] = gb_acc

print(f"Gradient Boosting Accuracy: {gb_acc:.2f}")

NameError: name 'gb_acc' is not defined

- Adaptive Boosting

In [33]:
ada_clf = AdaBoostClassifier(
        n_estimators=100, learning_rate=0.1, random_state=42
    )
    ada_clf.fit(X_train, y_train)
    ada_pred = ada_clf.predict(X_test)
    ada_acc = accuracy_score(y_test, ada_pred)
    results['AdaBoost'] = ada_acc

    # Print results
    print(f"AdaBoost Accuracy: {ada_acc:.2f}")

    return gb_clf, ada_clf, results

print(f"AdaBoost Accuracy: {ada_acc:.2f}")

IndentationError: unexpected indent (916596241.py, line 4)

Which model is the best and why?

In [31]:
model_scores = {
    'Bagging': accuracy_score(y_test, bagging_model.predict(X_test)),
    'Pasting': accuracy_score(y_test, pasting_model.predict(X_test)),
    'Random Forest': accuracy_score(y_test, rf_model.predict(X_test)),
    'Gradient Boosting': boost_results['Gradient Boosting'],
    'AdaBoost': boost_results['AdaBoost']
}

# Sort by accuracy
sorted_scores = sorted(model_scores.items(), key=lambda x: x[1], reverse=True)

print("\n✅ Model Performance Summary:")
for name, acc in sorted_scores:
    print(f"{name}: {acc:.2f}")

best_model = sorted_scores[0][0]
print(f"\n🏆 Best performing model: {best_model}")

NameError: name 'y_test' is not defined