# LAB | Ensemble Methods

**Load the data**

In this challenge, we will be working with the same Spaceship Titanic data, like the previous Lab. The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

In this Lab, you should try different ensemble methods in order to see if can obtain a better model than before. In order to do a fair comparison, you should perform the same feature scaling, engineering applied in previous Lab.

In [1]:
from sklearn.datasets import  fetch_california_housing
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com//data-bootcamp-v4//data//main//spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [3]:
spaceship.shape

(8693, 14)

In [4]:
spaceship.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [5]:
spaceship.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
spaceship.dropna(inplace = True)

In [7]:
spaceship["Cabin"] = spaceship["Cabin"].str.split("/").apply(lambda x: x[0])
spaceship["Cabin"].value_counts()

F    2152
G    1973
E     683
B     628
C     587
D     374
A     207
T       2
Name: Cabin, dtype: int64

In [8]:
spaceship = spaceship.drop(columns = ['PassengerId','Name'])

In [9]:
cat_f = spaceship[["HomePlanet","CryoSleep","Cabin","Destination","VIP"]]
num_f = spaceship.drop(columns=["HomePlanet","CryoSleep","Cabin","Destination","VIP","Transported"])

In [54]:
num_f

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,39.0,0.0,0.0,0.0,0.0,0.0
1,24.0,109.0,9.0,25.0,549.0,44.0
2,58.0,43.0,3576.0,0.0,6715.0,49.0
3,33.0,0.0,1283.0,371.0,3329.0,193.0
4,16.0,303.0,70.0,151.0,565.0,2.0
...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0
8689,18.0,0.0,0.0,0.0,0.0,0.0
8690,26.0,0.0,0.0,1872.0,1.0,0.0
8691,32.0,0.0,1049.0,0.0,353.0,3235.0


In [10]:
cat_f_encoded = pd.get_dummies(cat_f, drop_first = True)
cat_f_encoded

Unnamed: 0,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,1,0,0,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,0,1,1
3,1,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,0,0,0,0,0,0,0,0,0,1
8689,0,0,1,0,0,0,0,0,1,0,1,0,0
8690,0,0,0,0,0,0,0,0,1,0,0,1,0
8691,1,0,0,0,0,0,1,0,0,0,0,0,0


In [11]:
features = pd.concat([num_f, cat_f_encoded] ,axis = 1)
target = spaceship['Transported']

In [12]:
features

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,39.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,1,0,0,0,0,0,0,0,0,0,0,1,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,1,0,0,0,0,0,0,0,0,0,0,1,0
4,16.0,303.0,70.0,151.0,565.0,2.0,0,0,0,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41.0,0.0,6819.0,0.0,1643.0,74.0,1,0,0,0,0,0,0,0,0,0,0,0,1
8689,18.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,1,0,1,0,0
8690,26.0,0.0,0.0,1872.0,1.0,0.0,0,0,0,0,0,0,0,0,1,0,0,1,0
8691,32.0,0.0,1049.0,0.0,353.0,3235.0,1,0,0,0,0,0,1,0,0,0,0,0,0


In [13]:
target

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 6606, dtype: bool

**Perform Train Test Split**

In [14]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

In [15]:
normalizer = MinMaxScaler()
normalizer.fit(x_train)
x_train_norm = normalizer.transform(x_train)
x_test_norm = normalizer.transform(x_test)

In [16]:
x_train_norm = pd.DataFrame(x_train_norm, columns = x_train.columns)
x_train_norm

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,0.405063,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.050633,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.379747,0.000000,0.007916,0.000000,0.051276,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.215190,0.001310,0.000000,0.046111,0.016378,0.000049,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.329114,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5279,0.670886,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5280,0.455696,0.000000,0.000000,0.000000,0.032355,0.000098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5281,0.455696,0.000000,0.159528,0.000000,0.348893,0.004721,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5282,0.430380,0.000000,0.000134,0.000000,0.030569,0.087480,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [17]:
x_test_norm = pd.DataFrame(x_test_norm, columns = x_test.columns)
x_test_norm

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,0.632911,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.227848,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.189873,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.658228,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.784810,0.000000,0.054775,0.00000,0.077740,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,0.240506,0.000000,0.000000,0.05468,0.000045,0.001672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1318,0.468354,0.030242,0.115185,0.00000,0.000045,0.008409,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1319,0.544304,0.000202,0.178748,0.00000,0.000312,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1320,0.177215,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


**Model Selection** - now you will try to apply different ensemble methods in order to get a better model

- Bagging and Pasting

In [42]:
bagging_reg = BaggingClassifier(DecisionTreeClassifier(max_depth=20),
                               n_estimators=100,
                               max_samples = 1000)

In [43]:
bagging_reg.fit(x_train_norm, y_train)

In [44]:
pred = bagging_reg.predict(x_test_norm)

print("accuracy", bagging_reg.score(x_test_norm, y_test))

accuracy 0.7806354009077155


- Random Forests

In [45]:
forest = RandomForestClassifier(n_estimators=100,
                             max_depth=20)

In [46]:
forest.fit(x_train_norm, y_train)

In [47]:
pred = forest.predict(x_test_norm)


print("accuracy", forest.score(x_test_norm, y_test))

accuracy 0.7829046898638427


- Gradient Boosting

In [48]:
gb_reg = GradientBoostingClassifier(max_depth=20,
                                   n_estimators=100)

In [49]:
gb_reg.fit(x_train_norm, y_train)

In [50]:
pred = gb_reg.predict(x_test_norm)

print("Accuracy", gb_reg.score(x_test_norm, y_test))

Accuracy 0.7367624810892587


- Adaptive Boosting

In [51]:
ada_reg = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),
                            n_estimators=100)

In [52]:
ada_reg.fit(x_train_norm, y_train)



In [53]:
pred = ada_reg.predict(x_test_norm)

print("Accuracy", ada_reg.score(x_test_norm, y_test))

Accuracy 0.7723146747352496


Which model is the best and why?

# PCA

In [30]:
from sklearn.decomposition import PCA

In [31]:
x_train_norm = x_train_norm.drop(columns = ['HomePlanet_Europa','HomePlanet_Mars','CryoSleep_True','Cabin_B','Cabin_C','Cabin_D','Cabin_E','Cabin_F','Cabin_G','Cabin_T','Destination_PSO J318.5-22','Destination_TRAPPIST-1e','VIP_True'])

In [32]:
x_train_norm

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.405063,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.050633,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.379747,0.000000,0.007916,0.000000,0.051276,0.000000
3,0.215190,0.001310,0.000000,0.046111,0.016378,0.000049
4,0.329114,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
5279,0.670886,0.000000,0.000000,0.000000,0.000000,0.000000
5280,0.455696,0.000000,0.000000,0.000000,0.032355,0.000098
5281,0.455696,0.000000,0.159528,0.000000,0.348893,0.004721
5282,0.430380,0.000000,0.000134,0.000000,0.030569,0.087480


In [33]:
x_test_norm = x_test_norm.drop(columns = ['HomePlanet_Europa','HomePlanet_Mars','CryoSleep_True','Cabin_B','Cabin_C','Cabin_D','Cabin_E','Cabin_F','Cabin_G','Cabin_T','Destination_PSO J318.5-22','Destination_TRAPPIST-1e','VIP_True'])

In [34]:
x_test_norm

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0.632911,0.000000,0.000000,0.00000,0.000000,0.000000
1,0.227848,0.000000,0.000000,0.00000,0.000000,0.000000
2,0.189873,0.000000,0.000000,0.00000,0.000000,0.000000
3,0.658228,0.000000,0.000000,0.00000,0.000000,0.000000
4,0.784810,0.000000,0.054775,0.00000,0.077740,0.000000
...,...,...,...,...,...,...
1317,0.240506,0.000000,0.000000,0.05468,0.000045,0.001672
1318,0.468354,0.030242,0.115185,0.00000,0.000045,0.008409
1319,0.544304,0.000202,0.178748,0.00000,0.000312,0.000000
1320,0.177215,0.000000,0.000000,0.00000,0.000000,0.000000


In [35]:
y_train

3432    False
7312     True
2042    False
4999    False
5755     True
        ...  
6518    False
4317    False
2214    False
3468    False
3642    False
Name: Transported, Length: 5284, dtype: bool

In [36]:
y_test

2453     True
1334    False
8272     True
5090    False
4357     True
        ...  
85      False
6816     True
5926     True
3793     True
6640     True
Name: Transported, Length: 1322, dtype: bool

In [70]:
# Perform PCA
pca = PCA(n_components=6)
X_train_pca = pca.fit_transform(x_train_norm)
X_test_pca = pca.transform(x_test_norm)

# Create a DataFrame with the principal components
df_pca = pd.DataFrame(X_train_pca, columns=['PC1', 'PC2', 'PC3','PC4','PC5','PC6'])

# Explained variance
print("Explained variance ratio:", pca.explained_variance_ratio_.cumsum())

Explained variance ratio: [0.68714672 0.7743631  0.85499627 0.90737235 0.9550933  1.        ]


In [71]:
df_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6
0,0.037806,-0.001546,-0.038102,-0.000472,-0.010823,-0.008108
1,-0.315548,-0.009106,-0.011971,-0.002624,-0.006725,-0.007658
2,0.014945,-0.017371,-0.014192,-0.013897,0.021809,-0.035366
3,-0.150344,-0.005450,-0.015355,-0.000294,0.032711,0.016784
4,-0.037913,-0.003166,-0.032503,-0.000934,-0.009945,-0.008012
...,...,...,...,...,...,...
5279,0.302822,0.004125,-0.057701,0.001141,-0.013897,-0.008445
5280,0.089555,-0.008109,-0.030395,-0.005330,0.010377,-0.027004
5281,0.109614,-0.149560,0.163670,-0.153209,0.178345,-0.159342
5282,0.067568,-0.043590,0.013103,0.060838,-0.004654,-0.020586


In [72]:
y_train

3432    False
7312     True
2042    False
4999    False
5755     True
        ...  
6518    False
4317    False
2214    False
3468    False
3642    False
Name: Transported, Length: 5284, dtype: bool

In [73]:
lr = LinearRegression()

In [74]:
lr.fit(df_pca, y_train)

In [75]:
lr.score(X_test_pca, y_test)



0.15333535562630674

In [76]:
X_test_pca = pd.DataFrame(X_test_pca, columns = df_pca.columns)

In [61]:
X_test_pca

Unnamed: 0,PC1,PC2,PC3,PC4
0,0.264963,0.003315,-0.054901,0.000911
1,-0.138871,-0.005326,-0.025037,-0.001548
2,-0.176730,-0.006136,-0.022237,-0.001779
3,0.290202,0.003855,-0.056768,0.001064
4,0.422016,-0.034100,-0.011130,-0.045133
...,...,...,...,...
1317,-0.125632,-0.002194,-0.022599,0.004454
1318,0.107484,-0.027061,0.038017,-0.065811
1319,0.185042,-0.071542,0.041924,-0.111860
1320,-0.189350,-0.006406,-0.021303,-0.001856


In [56]:
y_train

3432    False
7312     True
2042    False
4999    False
5755     True
        ...  
6518    False
4317    False
2214    False
3468    False
3642    False
Name: Transported, Length: 5284, dtype: bool

In [57]:
y_test

2453     True
1334    False
8272     True
5090    False
4357     True
        ...  
85      False
6816     True
5926     True
3793     True
6640     True
Name: Transported, Length: 1322, dtype: bool