# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

Now perform the same as before:
- Feature Scaling
- Feature Selection


- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

- Evaluate your model

**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
import time

# Load dataset
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")

# Data preprocessing
spaceship['CryoSleep'] = spaceship['CryoSleep'].astype(bool)
spaceship['VIP'] = spaceship['VIP'].astype(bool)
spaceship = spaceship.dropna()
spaceship['Cabin'] = spaceship['Cabin'].astype(str)
spaceship['cabin_class'] = spaceship['Cabin'].str[0]
spaceship = spaceship.drop(columns=['PassengerId', 'Name', 'Cabin'])
spaceship = pd.get_dummies(spaceship, columns=['HomePlanet', 'cabin_class', 'Destination'])
spaceship['Transported'] = spaceship['Transported'].astype(int)

# Feature Selection
features = spaceship[['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'cabin_class_A', 'cabin_class_B', 'cabin_class_C', 'cabin_class_D', 'cabin_class_E', 'cabin_class_F', 'cabin_class_G', 'cabin_class_T', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']]
target = spaceship['Transported']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=4)

# Scale the data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to DataFrame for consistency
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Define a function to evaluate and store model results along with computation time
def evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test):
    start_time = time.time()  # Record start time
    model.fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)
    end_time = time.time()  # Record end time
    
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    score = model.score(X_test_scaled, y_test)
    computation_time = end_time - start_time  # Calculate computation time
    
    return accuracy, precision, recall, f1, score, computation_time

# Define hyperparameters for GridSearchCV and RandomizedSearchCV for different models
param_grids = {
    "BaggingClassifier": {
        "n_estimators": [50, 100, 200],
        "max_samples": [0.5, 1.0],
        "max_features": [0.5, 1.0]
    },
    "RandomForestClassifier": {
        "n_estimators": [50, 100, 200],
        "max_depth": [10, 30, None],
        "min_samples_split": [2, 5, 10]
    },
    "GradientBoostingClassifier": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10]
    },
    "AdaBoostClassifier": {
        "n_estimators": [50, 100, 200],
        "estimator__max_depth": [3, 5, 10]
    },
    "XGBClassifier": {
        "n_estimators": [50, 100, 200],
        "max_depth": [3, 5, 10]
    }
}

# Function to get the best model using GridSearchCV and RandomizedSearchCV
def best_model(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=5, n_jobs=-1)
    
    grid_search.fit(X_train, y_train)
    random_search.fit(X_train, y_train)
    
    best_grid = grid_search.best_estimator_
    best_random = random_search.best_estimator_
    
    # Compare results and return the best model
    if grid_search.best_score_ > random_search.best_score_:
        return best_grid
    else:
        return best_random

# Initialize results dictionary
results = {}

# KNN (No hyperparameter tuning for KNN in this example)
knn = KNeighborsClassifier(n_neighbors=10)
results['KNN'] = evaluate_model(knn, X_train_scaled, y_train, X_test_scaled, y_test)

# Bagging Classifier
bagging_clf = BaggingClassifier(DecisionTreeClassifier(max_depth=20), random_state=1)
bagging_clf = best_model(bagging_clf, param_grids["BaggingClassifier"], X_train_scaled, y_train)
results['Bagging'] = evaluate_model(bagging_clf, X_train_scaled, y_train, X_test_scaled, y_test)

# Random Forest Classifier
forest_clf = RandomForestClassifier(random_state=1)
forest_clf = best_model(forest_clf, param_grids["RandomForestClassifier"], X_train_scaled, y_train)
results['Random Forest'] = evaluate_model(forest_clf, X_train_scaled, y_train, X_test_scaled, y_test)

# Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=1)
gb_clf = best_model(gb_clf, param_grids["GradientBoostingClassifier"], X_train_scaled, y_train)
results['Gradient Boosting'] = evaluate_model(gb_clf, X_train_scaled, y_train, X_test_scaled, y_test)

# AdaBoost Classifier
ada_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=1), algorithm='SAMME')
ada_clf = best_model(ada_clf, param_grids["AdaBoostClassifier"], X_train_scaled, y_train)
results['AdaBoost'] = evaluate_model(ada_clf, X_train_scaled, y_train, X_test_scaled, y_test)

# XGBoost Classifier
xgb_clf = XGBClassifier(random_state=1)
xgb_clf = best_model(xgb_clf, param_grids["XGBClassifier"], X_train_scaled, y_train)
results['XGBoost'] = evaluate_model(xgb_clf, X_train_scaled, y_train, X_test_scaled, y_test)

# Display results in a table
results_df = pd.DataFrame(results, index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Model Score', 'Time (s)']).T
print(results_df)




                   Accuracy  Precision    Recall  F1 Score  Model Score  \
KNN                0.751620   0.751261  0.694099  0.721550     0.751620   
Bagging            0.778258   0.759259  0.763975  0.761610     0.778258   
Random Forest      0.799856   0.773952  0.802795  0.788110     0.799856   
Gradient Boosting  0.794096   0.744536  0.846273  0.792151     0.794096   
AdaBoost           0.780418   0.739745  0.812112  0.774241     0.780418   
XGBoost            0.793377   0.742857  0.847826  0.791878     0.793377   

                   Time (s)  
KNN                0.026001  
Bagging            0.927663  
Random Forest      0.192249  
Gradient Boosting  0.361823  
AdaBoost           1.236662  
XGBoost            0.068426  
