# LAB | Ensemble Methods

**Load the data**

In this challenge, we will work with the same Spaceship Titanic data as in the previous Lab. The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

In this Lab, you should try different ensemble methods to see if you can obtain a better model than before. For a fair comparison, you should apply the same feature scaling and feature engineering as in the previous Lab.

In [5]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [7]:
# Read the Spaceship Titanic CSV directly from the course repository.
DATA_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv"
spaceship = pd.read_csv(DATA_URL)

# Preview the first rows of the loaded frame.
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [9]:
# A notebook cell only echoes its LAST expression, so a bare `spaceship.shape`
# followed by another expression is silently discarded. Print it explicitly.
print(spaceship.shape)

# Column dtypes, shown as the cell's output.
spaceship.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [11]:
# Drop every row that contains at least one missing value.
# `inplace=True` mutates `spaceship` itself, so the raw frame is no longer
# available after this cell; re-run the loading cell to recover it.
spaceship.dropna(inplace=True)

In [13]:
# Reduce "Cabin" (values such as B/0/P in the preview above) to its first character.
# The vectorised `.str` accessor replaces the per-row lambda and propagates
# missing values instead of raising a TypeError on them.
spaceship["Cabin"] = spaceship["Cabin"].str[0]
spaceship

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [15]:
from sklearn.preprocessing import OneHotEncoder

# Names of every column holding text/categorical values.
non_numerical_columns = spaceship.select_dtypes(include=["object", "category"]).columns

# Expand those columns into indicator columns, then cast the whole frame
# (indicators, numeric features and the boolean target) to integers.
new_spaceship = pd.get_dummies(spaceship, columns=non_numerical_columns).astype(int)
new_spaceship

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,PassengerId_0001_01,PassengerId_0002_01,PassengerId_0003_01,...,Name_Zinonon Veriedeezy,Name_Zinopus Spandisket,Name_Zinor Axlentindy,Name_Zinor Proorbeng,Name_Zinoth Lansuffle,Name_Zosmark Trattle,Name_Zosmark Unaasor,Name_Zosmas Ineedeve,Name_Zosmas Mormonized,Name_Zubeneb Pasharne
0,39,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,109,9,25,549,44,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,58,43,3576,0,6715,49,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,33,0,1283,371,3329,193,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16,303,70,151,565,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,41,0,6819,0,1643,74,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8689,18,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8690,26,0,0,1872,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8691,32,0,1049,0,353,3235,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Perform Train Test Split**

In [None]:
# dropping PassengerId and Name
# `errors="ignore"` keeps this cell re-runnable: without it a second run raises
# KeyError because the columns are already gone.
spaceship = spaceship.drop(columns=["PassengerId", "Name"], errors="ignore")

# `new_spaceship` was one-hot encoded before this cell ran, so dropping the
# identifier columns from `spaceship` alone leaves one `PassengerId_*` / `Name_*`
# indicator column per distinct value in the modelling frame. Remove those too,
# otherwise the correlation matrix below is computed over thousands of columns.
identifier_prefixes = ("PassengerId_", "Name_")
new_spaceship = new_spaceship.loc[
    :, [col for col in new_spaceship.columns if not col.startswith(identifier_prefixes)]
]

In [None]:
# Absolute pairwise correlations between all columns of the encoded frame.
corr = np.abs(new_spaceship.corr())

# Set up mask for triangle representation: hide the upper triangle (diagonal
# included) so each pair is drawn only once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 12))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio.
# `cmap` and `ax` are now passed through: previously the palette was built
# but never used and the heatmap relied on the implicit current axes.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    ax=ax,
)
ax.set_title("Absolute pairwise correlation")

plt.show()

In [None]:
# Keep only the columns retained for their correlation with the target,
# plus the target column itself.
selected_columns = [
    "RoomService",
    "Spa",
    "VRDeck",
    "CryoSleep_False",
    "CryoSleep_True",
    "Transported",
]
df_filtered = new_spaceship.loc[:, selected_columns]
df_filtered

In [None]:
# Create a one-hot encoder with default settings.
# NOTE(review): `df_filtered` already holds integer columns and the encoder's
# output is not used by the feature/target cells further down — confirm
# whether this encoder is still needed.
enc = OneHotEncoder()

In [None]:
# Fit the encoder on every column of `df_filtered` (target column included).
enc.fit(df_filtered)

In [None]:
# enc.fit(new_spaceship)

In [None]:
# Encode `df_filtered` and display it as a dense array.
# The result is only displayed here; it is not assigned to a variable.
enc.transform(df_filtered).toarray()

In [None]:
# Split the filtered frame into the label to predict and its predictors.
target = df_filtered["Transported"]
features = df_filtered.drop(columns=["Transported"])

In [None]:
# selecting features
#features = new_spaceship.drop("Transported", axis=1)
#target = new_spaceship["Transported"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Create the scaler used to normalise the features before modelling.
normalizer = MinMaxScaler()

In [None]:
# Fit the scaler on the training features only, so that no information
# from the test split influences the scaling parameters.
normalizer.fit(X_train)

In [None]:
# Scale both partitions with the scaler fitted on the training features.
X_train_norm, X_test_norm = (
    normalizer.transform(partition) for partition in (X_train, X_test)
)

# Show the scaled training matrix.
X_train_norm

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier that votes among the 70 closest training points.
knn = KNeighborsClassifier(n_neighbors=70)

In [None]:
# Train on the scaled training features; later predictions must use scaled inputs too.
knn.fit(X_train_norm, y_train)

In [None]:
# Predict on the SCALED test features. The classifier was fitted on
# `X_train_norm`, so feeding it raw `X_test` values puts the query points on a
# different scale from the training points and distorts the neighbour search.
pred = knn.predict(X_test_norm)
pred

In [None]:
# True test labels as an array, for side-by-side comparison with the predictions.
y_test.values

In [None]:
# Score the KNN classifier on the scaled test split (mean accuracy for scikit-learn classifiers).
knn.score(X_test_norm, y_test)

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Train on the scaled training features; later predictions must use scaled inputs too.
model.fit(X_train_norm, y_train)

In [None]:
# Make predictions on the SCALED test features. The model was fitted on
# `X_train_norm`, so its coefficients only make sense for inputs transformed by
# the same scaler; raw `X_test` values would yield misleading predictions.
predictions = model.predict(X_test_norm)

In [None]:
# True test labels as an array, for side-by-side comparison with `predictions`.
y_test.values

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy

In [None]:
# decision tree
# Depth is capped at 10 levels. `random_state` is pinned (same seed as the
# train/test split) because scikit-learn permutes features at every split, so
# ties between equally good splits would otherwise be broken differently on
# each run and make the reported metrics non-reproducible.
# NOTE(review): this is a regressor fitted on the 0/1 `Transported` target —
# confirm a regressor (rather than a classifier) is intended here.
tree = DecisionTreeRegressor(max_depth=10, random_state=0)

In [None]:
# Train the decision tree on the scaled training features.
tree.fit(X_train_norm, y_train)

In [None]:
# Display the scaled training matrix the tree was fitted on.
X_train_norm

In [None]:
# Display the true test labels.
y_test

In [None]:
# Predict on the scaled test features, matching what the tree was fitted on.
pred = tree.predict(X_test_norm)

# scikit-learn metrics expect (y_true, y_pred) in that order.
print("MAE", mean_absolute_error(y_test, pred))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so this form works on every version.
print("RMSE", np.sqrt(mean_squared_error(y_test, pred)))
print("R2 score", tree.score(X_test_norm, y_test))

**Model Selection** - now you will try to apply different ensemble methods in order to get a better model

- Bagging and Pasting

In [None]:
#your code here

- Random Forests

In [None]:
#your code here

- Gradient Boosting

In [None]:
#your code here

- Adaptive Boosting

In [None]:
#your code here

Which model is the best and why?

In [None]:
#comment here