# LAB | Hyperparameter Tuning

**Load the data**

Final step: maximize the performance of your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before and then compare the best-performing models you have obtained so far, this time fine-tuning their hyperparameters.

In [270]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [271]:
# Download the Spaceship Titanic passenger table from the course repository.
DATA_URL = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv"
spaceship = pd.read_csv(filepath_or_buffer=DATA_URL)
# Last expression of the cell: the notebook renders the first five rows.
spaceship.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [272]:
# (rows, columns) of the dataset as loaded, before any rows are removed.
spaceship.shape

(8693, 14)

In [273]:
# Number of missing values in each column.
spaceship.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [274]:
# Remove, in place, every row that has at least one missing value.
spaceship.dropna(inplace=True)
# A notebook cell only renders its last expression, so the former bare
# `spaceship.head()` in the middle of this cell displayed nothing and was removed.
# The remaining (rows, columns) count is what the cell shows.
spaceship.shape

(6606, 14)

In [None]:


import copy

# Work on an independent deep copy so `spaceship` itself is left untouched.
df_spaceship = copy.deepcopy(spaceship)

# Turn the three boolean flag columns into 0/1 integers.
flag_columns = ['CryoSleep', 'VIP', 'Transported']
df_spaceship[flag_columns] = df_spaceship[flag_columns].astype(int)

# Keep only the first character of the cabin string (e.g. "B/0/P" -> "B").
df_spaceship['Cabin'] = df_spaceship['Cabin'].str.get(0)

# Keep the first four characters of the passenger id, as an integer
# (e.g. "0003_02" -> 3).
df_spaceship['PassengerId'] = df_spaceship['PassengerId'].str.slice(stop=4).astype(int)

# Normalise the column labels: surrounding whitespace removed, lower-cased.
df_spaceship.columns = [label.strip().lower() for label in df_spaceship.columns]

# The modelling frame leaves out the passenger id and the passenger name.
df_spaceship_cleaned = df_spaceship.drop(['passengerid', 'name'], axis=1)



In [None]:
# One-hot encode the three categorical columns as 0/1 integer indicator columns.
categorical_columns = ['homeplanet', 'cabin', 'destination']
df_spaceship_cleaned = pd.get_dummies(
    df_spaceship_cleaned,
    columns=categorical_columns,
    dtype=int,
)

# Separate the target column from the predictors.
target_column = "transported"
y = df_spaceship_cleaned[target_column]
X = df_spaceship_cleaned.drop(target_column, axis=1)

# Show the dtype of every predictor column.
print(X.dtypes)


cryosleep                      int64
age                          float64
vip                            int64
roomservice                  float64
foodcourt                    float64
shoppingmall                 float64
spa                          float64
vrdeck                       float64
homeplanet_Earth               int64
homeplanet_Europa              int64
homeplanet_Mars                int64
cabin_A                        int64
cabin_B                        int64
cabin_C                        int64
cabin_D                        int64
cabin_E                        int64
cabin_F                        int64
cabin_G                        int64
cabin_T                        int64
destination_55 Cancri e        int64
destination_PSO J318.5-22      int64
destination_TRAPPIST-1e        int64
dtype: object


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Make sure both label vectors are integer typed.
y_train, y_test = y_train.astype(int), y_test.astype(int)


In [None]:
from sklearn.base import is_regressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameter grids to search, keyed by the same names as `models_dict` below.
param_grids = {
    "RandomForest": {
        'n_estimators': [10, 100, 250, 500],
        'max_depth': [1, 5, 10, 25],
        'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    "DecisionTree": {
        'max_depth': [1, 5, 10, 25],
        'min_samples_leaf': [1, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    "GradientBoosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10]
    },
    "AdaBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2]
    },
    "GradientBoostingRegressor": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10]
    }
}

# Best estimator found for each model name; filled in by the loop below.
best_models = {}

# Base estimators to tune; every key must also exist in `param_grids`.
models_dict = {
    "RandomForest": RandomForestClassifier(random_state=42, class_weight='balanced'),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42)
}

# Run a 5-fold cross-validated grid search for every model on the training split.
for model_name, model in models_dict.items():

    print(f"🔍 Running GridSearchCV for {model_name}...")

    # Choose the metric from the estimator itself rather than from its dictionary
    # key, so renaming a key can never silently select the wrong metric.
    scoring_metric = "neg_mean_squared_error" if is_regressor(model) else "accuracy"

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring=scoring_metric,
        n_jobs=-1, cv=5, verbose=1
    )

    grid_search.fit(X_train, y_train)
    # With GridSearchCV's default refit=True, best_estimator_ is the winning
    # configuration refit on the whole training split.
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best {model_name}: {grid_search.best_params_}\n")

# Alias kept so the tuned estimators are also available under the name `models`.
models = best_models


🔍 Running GridSearchCV for RandomForest...
Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [None]:
from sklearn.base import is_regressor
from sklearn.metrics import accuracy_score, mean_squared_error

# Score every tuned model on both the training and the test split.
for model_name, model in best_models.items():
    print(f"Evaluating {model_name}...")

    # Predictions on the data the model was fit on and on the held-out data.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Choose the metric from the estimator itself rather than from its dictionary
    # key: regressors output continuous values, which accuracy_score cannot score.
    if is_regressor(model):
        train_score = mean_squared_error(y_train, y_train_pred)
        test_score = mean_squared_error(y_test, y_test_pred)
        print(f"{model_name} - Train MSE: {train_score:.4f}, Test MSE: {test_score:.4f}")
    else:
        train_score = accuracy_score(y_train, y_train_pred)
        test_score = accuracy_score(y_test, y_test_pred)
        print(f"{model_name} - Train Accuracy: {train_score:.4f}, Test Accuracy: {test_score:.4f}")


🔎 Evaluating RandomForest...
✅ RandomForest - Train Accuracy: 0.8295, Test Accuracy: 0.8094
🔎 Evaluating DecisionTree...
✅ DecisionTree - Train Accuracy: 0.8159, Test Accuracy: 0.7897
🔎 Evaluating GradientBoosting...
✅ GradientBoosting - Train Accuracy: 0.8223, Test Accuracy: 0.8101
🔎 Evaluating AdaBoost...
✅ AdaBoost - Train Accuracy: 0.7642, Test Accuracy: 0.7481
🔎 Evaluating GradientBoostingRegressor...
🔹 GradientBoostingRegressor - Train MSE: 0.1165, Test MSE: 0.1334
