# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [1]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
spaceship.shape

(8693, 14)

In [4]:
spaceship.dtypes

Unnamed: 0,0
PassengerId,object
HomePlanet,object
CryoSleep,object
Cabin,object
Destination,object
Age,float64
VIP,object
RoomService,float64
FoodCourt,float64
ShoppingMall,float64


In [5]:
spaceship.isnull().sum()

Unnamed: 0,0
PassengerId,0
HomePlanet,201
CryoSleep,217
Cabin,199
Destination,182
Age,179
VIP,203
RoomService,181
FoodCourt,183
ShoppingMall,208


In [6]:
spaceship_cleaned = spaceship.dropna()

In [7]:
spaceship_cleaned.shape

(6606, 14)

In [8]:
def transform_cabin(cabin):
    if pd.isna(cabin):
        return 'T'  # Handling missing values by assigning them 'T'
    first_letter = cabin[0]
    if first_letter in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
        return first_letter
    else:
        return 'T'  # Assign 'T' for any other letters (e.g., 'X')

spaceship_cleaned['Cabin'] = spaceship_cleaned['Cabin'].apply(transform_cabin)

spaceship_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spaceship_cleaned['Cabin'] = spaceship_cleaned['Cabin'].apply(transform_cabin)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [9]:
spaceship_cleaned = spaceship_cleaned.drop(columns=['PassengerId', 'Name'])

spaceship_cleaned.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [10]:
# Drop the non-numeric columns for scaling (e.g., 'Name', 'Transported')
spaceship_dummies = pd.get_dummies(spaceship_cleaned, drop_first=True)  # Create dummy variables

# Separate numeric columns (excluding 'Transported', as it's the target)
numeric_cols = spaceship_dummies.select_dtypes(include=['float64', 'int64']).columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Perform the scaling on the numeric columns
spaceship_dummies[numeric_cols] = scaler.fit_transform(spaceship_dummies[numeric_cols])

# Display the first few rows to check the scaling
spaceship_dummies.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,0.695413,-0.345756,-0.285355,-0.309494,-0.273759,-0.269534,False,True,False,False,True,False,False,False,False,False,False,False,True,False
1,-0.336769,-0.176748,-0.279993,-0.266112,0.206165,-0.230494,True,False,False,False,False,False,False,False,True,False,False,False,True,False
2,2.002842,-0.279083,1.845163,-0.309494,5.596357,-0.226058,False,True,False,False,False,False,False,False,False,False,False,False,True,True
3,0.28254,-0.345756,0.479034,0.334285,2.636384,-0.098291,False,True,False,False,False,False,False,False,False,False,False,False,True,False
4,-0.887266,0.124056,-0.24365,-0.04747,0.220152,-0.267759,True,False,False,False,False,False,False,False,True,False,False,False,True,False


In [11]:
# Calculate correlation matrix
correlation_matrix = spaceship_dummies.corr()

# Check the correlation of each feature with the target (Transported)
target_corr = correlation_matrix['Transported'].sort_values(ascending=False)

# Display the correlation of all features with 'Transported'
target_corr

Unnamed: 0,Transported
Transported,1.0
CryoSleep_True,0.462803
HomePlanet_Europa,0.182004
Cabin_B,0.146288
Cabin_C,0.109988
FoodCourt,0.055025
Cabin_G,0.022711
HomePlanet_Mars,0.012357
ShoppingMall,0.011602
Destination_PSO J318.5-22,0.001281


In [12]:
# Initialize the model for feature selection
model = LogisticRegression(max_iter=1000)

# Use RFE for feature selection
selector = RFE(model, n_features_to_select=5)  # Selecting top 5 features
selector = selector.fit(spaceship_dummies.drop('Transported', axis=1), spaceship_dummies['Transported'])

# Get the features that were selected
selected_features = spaceship_dummies.drop('Transported', axis=1).columns[selector.support_]

# Display the selected features
print(f"Selected features: {selected_features}")

Selected features: Index(['Spa', 'VRDeck', 'HomePlanet_Europa', 'CryoSleep_True', 'Cabin_C'], dtype='object')


In [13]:
# Separate the features (X) and target (y)
X = spaceship_dummies.drop('Transported', axis=1)  # Features (all columns except 'Transported')
y = spaceship_dummies['Transported']  # Target (the 'Transported' column)

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (5284, 19)
X_test shape: (1322, 19)
y_train shape: (5284,)
y_test shape: (1322,)


- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [14]:
# Initialize individual classifiers
lr = LogisticRegression(random_state=42)
svc = SVC(random_state=42)
dt = DecisionTreeClassifier(random_state=42)

# Combine the classifiers in a voting ensemble (Pasting is also a form of voting, just without replacement)
voting_model = VotingClassifier(estimators=[('lr', lr), ('svc', svc), ('dt', dt)], voting='hard')

# Fit the Voting model on the training data
voting_model.fit(X_train, y_train)

# Predict on the test data
y_pred_voting = voting_model.predict(X_test)

- Evaluate your model

In [15]:
# Evaluate the model
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy of Pasting (Voting) Classifier: {accuracy_voting:.4f}")

Accuracy of Pasting (Voting) Classifier: 0.8139


**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [16]:
param_grid = {
    # Logistic Regression
    'lr__C': [0.1, 1, 10, 100],
    'lr__solver': ['liblinear', 'lbfgs'],
    'lr__max_iter': [100, 200, 300],
    'lr__penalty': ['l2'],

    # Support Vector Classifier
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto'],
    'svc__degree': [3, 5],
    'svc__coef0': [0.0, 0.1],

    # Decision Tree Classifier
    'dt__max_depth': [5, 10, 15, 20, None],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
    'dt__max_features': ['sqrt', 'log2', None],
    'dt__criterion': ['gini', 'entropy']
}

- Run Grid Search

In [None]:
# Initialize the GridSearchCV
grid_search = GridSearchCV(estimator=voting_model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Accuracy: {grid_search.best_score_:.4f}")

# Predict on the test data with the best model
y_pred_grid = grid_search.predict(X_test)
accuracy_grid = accuracy_score(y_test, y_pred_grid)
print(f"Accuracy of Voting Classifier after Grid Search: {accuracy_grid:.4f}")

- Evaluate your model

In [None]:
# Get the best model from grid search
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_model.predict(X_test)

# Evaluate the model's performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model after Grid Search: {accuracy:.4f}")

# For more detailed evaluation
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))