In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


In [72]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [73]:
spaceship = spaceship.dropna()

Now perform the same as before:

- Feature Scaling
- Feature Selection

In [74]:
#Feature Scaling
features = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
X = spaceship[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled_df = pd.DataFrame(X_scaled, columns=features)

In [75]:
#Feature Selection
target =  spaceship["Transported"]

forest = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=5)
forest.fit(X_scaled, target)

selected_features = [features[i] for i in range(len(features)) if forest.support_[i]]
X_selected = X_scaled_df[selected_features]
print("Selected Features:", selected_features)

Selected Features: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


- Perform train test Split

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

- Bagging and Pasting

In [77]:
#Bagging
bagging_clf = BaggingClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100,  max_samples=1000, bootstrap=True, random_state=0)

bagging_clf.fit(X_train, y_train)
pred = bagging_clf.predict(X_test)
bagging_accuracy = accuracy_score(y_test, pred)

print(f"Bagging Accuracy: {bagging_accuracy}")

Bagging Accuracy: 0.7957639939485628


In [78]:
#Pasting
pasting_model = BaggingClassifier(DecisionTreeClassifier(max_depth=20), n_estimators=100, max_samples=100, bootstrap=False, random_state=0)

pasting_model.fit(X_train, y_train)
pred = pasting_model.predict(X_test)
pasting_accuracy = accuracy_score(y_test, pred)

print(f"Pasting Accuracy: {pasting_accuracy}")

Pasting Accuracy: 0.7791225416036308


- Random Forest

In [79]:
forest = RandomForestClassifier(n_estimators=100, random_state=0)

forest.fit(X_train, y_train)
pred = forest.predict(X_test)

rf_accuracy = accuracy_score(y_test, pred)
print("Classification Report:", classification_report(y_test, pred))
print("Random Forest Accuracy:", rf_accuracy)
print("Confusion Matrix:", confusion_matrix(y_test, pred))


Classification Report:
               precision    recall  f1-score   support

       False       0.83      0.73      0.78       653
        True       0.77      0.86      0.81       669

    accuracy                           0.80      1322
   macro avg       0.80      0.79      0.79      1322
weighted avg       0.80      0.80      0.79      1322

Random Forest Accuracy: 0.7950075642965204

Confusion Matrix:
 [[479 174]
 [ 97 572]]


- Gradient Boosting

In [86]:
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

gb_clf.fit(X_train, y_train)
pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, pred)

print("Classification Report:", classification_report(y_test, pred))
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Confusion Matrix:", confusion_matrix(y_test, pred))


Classification Report:
               precision    recall  f1-score   support

       False       0.85      0.73      0.79       653
        True       0.77      0.87      0.82       669

    accuracy                           0.80      1322
   macro avg       0.81      0.80      0.80      1322
weighted avg       0.81      0.80      0.80      1322

Gradient Boosting Accuracy: 0.8040847201210287

Confusion Matrix:
 [[479 174]
 [ 85 584]]




- Adaptive Boosting (AdaBoosting)

In [85]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=1, random_state=0)

ada_clf.fit(X_train, y_train)
pred = ada_clf.predict(X_test)
ada_accuracy = accuracy_score(y_test, pred)

print("Classification Report:", classification_report(y_test, pred))
print("AdaBoost Accuracy:", ada_accuracy)
print("Confusion Matrix:", confusion_matrix(y_test, pred))



Classification Report:               precision    recall  f1-score   support

       False       0.84      0.72      0.78       653
        True       0.76      0.87      0.81       669

    accuracy                           0.80      1322
   macro avg       0.80      0.79      0.79      1322
weighted avg       0.80      0.80      0.79      1322

AdaBoost Accuracy: 0.7957639939485628
Confusion Matrix: [[472 181]
 [ 89 580]]


- Which model is the best and why?

Bagging = 0.796
Pasting = 0.779
Random Forests = 0.795
Gradient Boosting = 0.804
AdaBoosting = 0.796

Select Gradient Boosting due to its higher value of accuracy in this specific result set. 
This value might be due to their sequential nature.