# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [13]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

Now perform the same as before:
- Feature Scaling
- Feature Selection


In [14]:
# 1. Load the data
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [15]:
# 2. Drop missing values 
spaceship_cleaned = spaceship.dropna()
spaceship_cleaned.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [16]:
# 3. Convert to numerical data --> dummify
spaceship_cleaned['Cabin'] = spaceship_cleaned['Cabin'].apply(lambda x: x[0])

spaceship_cleaned_2 = spaceship_cleaned.drop(columns=['PassengerId', 'Name'])

spaceship_cleaned_2 = pd.get_dummies(spaceship_cleaned_2)
boolean_columns = spaceship_cleaned_2.select_dtypes(include=['bool']).columns
spaceship_cleaned_2[boolean_columns] = spaceship_cleaned_2[boolean_columns].astype(int)

spaceship_cleaned_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spaceship_cleaned['Cabin'] = spaceship_cleaned['Cabin'].apply(lambda x: x[0])


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
0,39.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,1,1,0,0,...,0,0,1,0,0,0,0,1,1,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0,0,1,0,...,0,0,0,0,0,0,0,1,1,0
4,16.0,303.0,70.0,151.0,565.0,2.0,1,1,0,0,...,0,0,1,0,0,0,0,1,1,0


In [17]:
# 4. Feature scaling 
# 4.1 Import and fit normalizer 
normalizer = MinMaxScaler()
normalizer.fit(X_train)

In [20]:
# 4.2 Converting normalisation for numerical columns 
X_train_norm = normalizer.transform(X_train)
X_test_norm = normalizer.transform(X_test)

In [21]:
# display
X_train_norm

array([[3.16455696e-01, 0.00000000e+00, 5.61164593e-02, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [4.55696203e-01, 0.00000000e+00, 8.80152953e-02, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [4.30379747e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [3.16455696e-01, 0.00000000e+00, 3.85737765e-03, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.06329114e-01, 0.00000000e+00, 7.04390702e-04, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [3.41772152e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [22]:
X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns, index= X_train.index)
X_train_norm.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
7832,0.316456,0.0,0.056116,0.0,0.02865,0.030094,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5842,0.455696,0.0,0.088015,0.135232,0.124911,4.9e-05,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3928,0.43038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4091,0.468354,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
7679,0.278481,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [25]:
X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns, index= X_test.index)
X_test_norm.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
8441,0.367089,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
8058,0.164557,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
320,0.632911,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2548,0.075949,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
8027,0.468354,0.024458,0.000101,0.049049,4.5e-05,0.003344,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [29]:
# 5. Perform Train Test Split

# Define target and features
features = spaceship_cleaned_2.drop(columns=['Transported'])  
target = spaceship_cleaned_2['Transported']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_False,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True
7832,-0.267957,-0.345756,0.711389,-0.309494,0.287464,0.273473,-1.083063,1.717147,-0.510811,0.738664,...,-0.244975,-0.339578,-0.695098,-0.652578,-0.017402,1.922263,-0.322689,-1.501395,0.158555,-0.158555
5842,0.488976,-0.345756,1.277978,2.565821,2.173069,-0.268647,-1.083063,1.717147,-0.510811,0.738664,...,-0.244975,-0.339578,-0.695098,-0.652578,-0.017402,-0.520220,-0.322689,0.666047,0.158555,-0.158555
3928,0.351352,-0.345756,-0.285355,-0.309494,-0.273759,-0.269534,-1.083063,-0.582361,1.957672,-1.353795,...,-0.244975,-0.339578,1.438646,-0.652578,-0.017402,1.922263,-0.322689,-1.501395,0.158555,-0.158555
4091,0.557788,-0.345756,-0.285355,-0.309494,-0.273759,-0.269534,0.923307,-0.582361,-0.510811,-1.353795,...,-0.244975,-0.339578,-0.695098,1.532384,-0.017402,1.922263,-0.322689,-1.501395,0.158555,-0.158555
7679,-0.474393,-0.345756,-0.285355,-0.309494,-0.273759,-0.269534,0.923307,-0.582361,-0.510811,-1.353795,...,-0.244975,-0.339578,-0.695098,1.532384,-0.017402,-0.520220,3.098956,-1.501395,0.158555,-0.158555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4984,0.144916,-0.345756,-0.285355,-0.309494,-0.273759,-0.269534,-1.083063,1.717147,-0.510811,-1.353795,...,-0.244975,2.944832,-0.695098,-0.652578,-0.017402,1.922263,-0.322689,-1.501395,0.158555,-0.158555
6864,-0.474393,-0.302341,-0.169773,-0.306023,-0.273759,0.058755,0.923307,-0.582361,-0.510811,0.738664,...,-0.244975,-0.339578,1.438646,-0.652578,-0.017402,-0.520220,3.098956,-1.501395,0.158555,-0.158555
6919,-0.267957,-0.345756,-0.216840,-0.309494,1.319870,3.315911,-1.083063,1.717147,-0.510811,0.738664,...,-0.244975,-0.339578,-0.695098,-0.652578,-0.017402,-0.520220,-0.322689,0.666047,0.158555,-0.158555
7137,0.764225,-0.345756,-0.272844,-0.309494,1.040133,0.965541,-1.083063,1.717147,-0.510811,0.738664,...,-0.244975,-0.339578,-0.695098,-0.652578,-0.017402,1.922263,-0.322689,-1.501395,-6.306963,6.306963


- Evaluate your model

In [30]:
# 6. By using the model with the best performance --> random forest 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model
rf_clf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")

Random Forest Accuracy: 0.8116490166414524


**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [33]:
# Grid Search 
from sklearn.model_selection import GridSearchCV

# 7.1 define the grid with values to consider when train several possible combinations
grid = {"max_leaf_nodes": [250, 500, 1000],
        "max_depth":[10,30,50]}

In [34]:
dt = DecisionTreeRegressor()

In [35]:
model = GridSearchCV(estimator = dt, param_grid = grid, cv=5, verbose=10)

In [36]:
model.fit(X_train_norm, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START max_depth=10, max_leaf_nodes=250............................
[CV 1/5; 1/9] END max_depth=10, max_leaf_nodes=250;, score=0.356 total time=   0.0s
[CV 2/5; 1/9] START max_depth=10, max_leaf_nodes=250............................
[CV 2/5; 1/9] END max_depth=10, max_leaf_nodes=250;, score=0.302 total time=   0.0s
[CV 3/5; 1/9] START max_depth=10, max_leaf_nodes=250............................
[CV 3/5; 1/9] END max_depth=10, max_leaf_nodes=250;, score=0.331 total time=   0.0s
[CV 4/5; 1/9] START max_depth=10, max_leaf_nodes=250............................
[CV 4/5; 1/9] END max_depth=10, max_leaf_nodes=250;, score=0.247 total time=   0.0s
[CV 5/5; 1/9] START max_depth=10, max_leaf_nodes=250............................
[CV 5/5; 1/9] END max_depth=10, max_leaf_nodes=250;, score=0.329 total time=   0.0s
[CV 1/5; 2/9] START max_depth=10, max_leaf_nodes=500............................
[CV 1/5; 2/9] END max_depth=10, ma

In [37]:
# what has been the best combination for the hyperparameters
model.best_params_

{'max_depth': 10, 'max_leaf_nodes': 1000}

In [38]:
best_model = model.best_estimator_

In [None]:
# Let's evaluate this model on the TEST set --> Grid
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
pred = best_model.predict(X_test_norm)

print(f"MAE: {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score:  {best_model.score(X_test_norm, y_test): .2f}")

In [41]:
# random Search 
from sklearn.model_selection import RandomizedSearchCV

In [42]:
grid = {"max_leaf_nodes": [int(x) for x in np.linspace(start = 5, stop = 30, num = 3)],
        "max_depth":[int(x) for x in np.linspace(1, 11, num = 3)]}

In [43]:
dt = DecisionTreeRegressor()

model = RandomizedSearchCV(estimator = dt, param_distributions = grid, n_iter = 5, cv = 5)

In [44]:
model.fit(X_train_norm,y_train)

In [45]:
model.best_params_

{'max_leaf_nodes': 30, 'max_depth': 11}

In [46]:
best_model = model.best_estimator_

In [47]:
# Let's evaluate this model on the TEST set --> Random
pred = best_model.predict(X_test_norm)

print(f"MAE: {mean_absolute_error(pred, y_test): .2f}")
print(f"RMSE: {mean_squared_error(pred, y_test, squared=False): .2f}")
print(f"R2 score:  {best_model.score(X_test_norm, y_test): .2f}")

MAE:  0.27
RMSE:  0.38
R2 score:   0.44


- Evaluate your model

1. Test Accuracy: 81%
   
2. MAE/RMSE: very close --> predict the same errors at all samples

3. R2 score: 44%

Overall, the model is solid. 