# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [27]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [29]:
df = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [34]:
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [36]:
df.dropna(inplace=True)

In [38]:
df['CryoSleep'] = df['CryoSleep'].fillna(False).astype(bool)
df['VIP'] = df['VIP'].fillna(False).astype(bool)
df['Transported'] = df['Transported'].astype(bool)
df

  df['CryoSleep'] = df['CryoSleep'].fillna(False).astype(bool)
  df['VIP'] = df['VIP'].fillna(False).astype(bool)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [40]:
#Train test
import pandas as pd
from sklearn.model_selection import train_test_split

# Supongamos este es tu DataFrame transformado
# Se usa solo las columnas seleccionadas para X, quitando PassengerId, Cabin, Name y la columna de objetivo
columns_to_use = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
                  'HomePlanet_Europa', 'HomePlanet_Earth', # Dummies
                  'Destination_TRAPPIST-1e', 'Destination_55_Cancri_e', # Dummies
                  'Deck_B', 'Deck_F', 'Deck_A'] # También dummies de Deck si fueran necesarias

# Asegúrate de que estas variables existen en tu DataFrame modificado

# Definir X (características) e y (objetivo)
X = df[columns_to_use]
y = df['Transported']

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verificar la forma de X_train para asegurar que todo está correcto
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

KeyError: "['HomePlanet_Europa', 'HomePlanet_Earth', 'Destination_TRAPPIST-1e', 'Destination_55_Cancri_e', 'Deck_B', 'Deck_F', 'Deck_A'] not in index"

In [42]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Variables numéricas que necesitan ser escaladas
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Crear una instancia de escalador
scaler = StandardScaler()

# Escalar las características numéricas.
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

NameError: name 'X_train' is not defined

In [44]:
#Feature selection
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier

# Usaremos SelectKBest para seleccionar características basadas en ANOVA F-value
selector = SelectKBest(score_func=f_classif, k='all')
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Evaluar impacto de Feature Selection ejecutando el modelo nuevamente
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_selected, y_train)
y_pred_selected = clf.predict(X_test_selected)

# Evaluar el modelo con características seleccionadas
print(f"Accuracy after Feature Selection: {accuracy_score(y_test, y_pred_selected):.2f}")
print("Classification Report after Feature Selection:")
print(classification_report(y_test, y_pred_selected))

NameError: name 'X_train' is not defined

- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Supongamos que ya tienes tu X_train, X_test, y_train, y y_test definidos

# Crear el modelo base de Random Forest
rf = RandomForestClassifier(random_state=42)

# Definir el grid de hiperparámetros a explorar
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Realizar el ajuste fino de hiperparámetros
grid_search.fit(X_train, y_train)

# Obtener el mejor modelo con los mejores hiperparámetros
best_model = grid_search.best_estimator_

# Evaluar el modelo ajustado en el conjunto de prueba
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best parameters found:", grid_search.best_params_)
print(f"Optimized Model Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'X_train' is not defined

- Evaluate your model

In [48]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Ya tienes y_test y y_pred del mejor modelo que ajustaste

# Calcular la exactitud
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Mostrar la matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Reporte de clasificación
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calcular y graficar la curva ROC y AUC
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilidades de la clase positiva

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

NameError: name 'y_test' is not defined

**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Definir la configuración del grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Diferentes cantidades de árboles
    'max_depth': [None, 10, 20],  # Profundidades variadas; None indica sin restricción
    'min_samples_split': [2, 5, 10],  # Tamaños mínimos de división
    'min_samples_leaf': [1, 2, 4],  # Tamaños mínimos de hojas
    'max_features': ['sqrt', 'log2']  # Número de características a considerar
}

# Crear el modelo base de Random Forest
rf = RandomForestClassifier(random_state=42)

# Configurar el GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Posteriormente ejecutarás grid_search.fit(X_train, y_train) para realizar la búsqueda

- Run Grid Search

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Supongamos que ya tienes X_train, X_test, y_train, y y_test definidos

# Definir el grid de hiperparámetros
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Crear el modelo base
rf = RandomForestClassifier(random_state=42)

# Configurar y ejecutar el GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Obtener el mejor modelo
best_model = grid_search.best_estimator_

# Hacer predicciones en el conjunto de prueba
y_pred = best_model.predict(X_test)

# Evaluar el modelo
accuracy = accuracy_score(y_test, y_pred)
print("Best parameters found:", grid_search.best_params_)
print(f"Optimized Model Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'X_train' is not defined

- Evaluate your model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Ya tienes y_test y y_pred tras ejecutar el modelo
# en el conjunto de prueba

# Calcular la exactitud
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Mostrar la matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Reporte de clasificación
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calcular y graficar la curva ROC y AUC
y_pred_proba = best_model.predict_proba(X_test)[:, 1]  # Probabilidades de la clase positiva

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()