# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [1]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [4]:
def scale_and_select_features(df, target_column='Transported', k=5):
    # Copy to avoid modifying original
    df = df.copy()

    # Convert booleans to integers
    for col in ['CryoSleep', 'VIP', target_column]:
        df[col] = df[col].astype(int)

    # One-hot encode categorical variables
    df = pd.get_dummies(df, columns=['HomePlanet', 'Destination', 'Cabin'], drop_first=True)

    # Drop unused or identifier columns
    df.drop(columns=['PassengerId', 'Name'], inplace=True)

    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]

 # Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Feature selection
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X_scaled, y)

    # Get selected feature names
    selected_features = X.columns[selector.get_support()]

    # Return selected data and feature names
    return X_selected, y.values, selected_features

In [6]:
df = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")  
X_selected, y, selected_features = scale_and_select_features(df)

print("Selected Features:", selected_features.tolist())
print("Feature Matrix Shape:", X_selected.shape)

ValueError: cannot convert float NaN to integer

- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def fine_tune_gradient_boosting(X_train, X_test, y_train, y_test):
    # Define model
    gb = GradientBoostingClassifier(random_state=42)

    # Define hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [2, 3, 5],
        'subsample': [0.8, 1.0]
    }

    # Grid search with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=gb,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    # Fit
    grid_search.fit(X_train, y_train)

    # Best model
    best_model = grid_search.best_estimator_

    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    print(f"\nâœ… Best Parameters: {grid_search.best_params_}")
    print(f"ðŸŽ¯ Accuracy after tuning: {test_acc:.2f}")

    return best_model, grid_search.best_params_

best_gb_model, best_params = fine_tune_gradient_boosting(X_train, X_test, y_train, y_test)

NameError: name 'X_train' is not defined

- Evaluate your model

In [11]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test):
    # Predict
    y_pred = model.predict(X_test)

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"ðŸŽ¯ Accuracy: {acc:.2f}")

    # Classification report
    print("\nðŸ“‹ Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["False", "True"], yticklabels=["False", "True"])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.show()

evaluate_model(best_gb_model, X_test, y_test)

NameError: name 'best_gb_model' is not defined

**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [13]:
param_grid = {
    'n_estimators': [50, 100, 150],          # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.05, 0.1],      # Step size shrinkage
    'max_depth': [2, 3, 5],                  # Max depth of individual trees
    'subsample': [0.8, 1.0],                 # Fraction of samples used per tree
    'min_samples_split': [2, 5]              # Minimum samples to split an internal node
}

grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

- Run Grid Search

In [16]:

def run_grid_search(X_train, y_train, X_test, y_test):
    # Define the hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [2, 3, 5],
        'subsample': [0.8, 1.0],
        'min_samples_split': [2, 5]
    }

    # Initialize the model
    gb_clf = GradientBoostingClassifier(random_state=42)

    # Set up the grid search
    grid_search = GridSearchCV(
        estimator=gb_clf,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1
    )

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Get the best estimator
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Output
    print("\nâœ… Best Hyperparameters Found:")
    for param, val in best_params.items():
        print(f"  {param}: {val}")
    print(f"\nðŸŽ¯ Test Accuracy of Tuned Model: {test_accuracy:.2f}")

    return best_model, best_params

In [17]:
best_gb_model, best_gb_params = run_grid_search(X_train, y_train, X_test, y_test)

NameError: name 'X_train' is not defined

- Evaluate your model

In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test):
    # Predict
    y_pred = model.predict(X_test)

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    print(f"ðŸŽ¯ Accuracy: {acc:.2f}")

    # Classification Report
    print("\nðŸ“‹ Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["False", "True"], yticklabels=["False", "True"])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.show()

    return acc

In [19]:
evaluate_model(best_gb_model, X_test, y_test)

NameError: name 'best_gb_model' is not defined