## Hist Gradient Boosting Hyperparamter Tuning

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
import optuna

In [2]:
df = pd.read_csv('spot_30k_clean.csv', index_col=0)
df.head()

Unnamed: 0,y,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,66,0.748,0.916,-2.634,0.0583,0.102,0.0,0.0653,0.518,122.036,...,0,0,0,0,0,0,0,1,0,0
1,67,0.726,0.815,-4.969,0.0373,0.0724,0.00421,0.357,0.693,99.972,...,1,0,0,0,0,0,0,0,0,0
2,70,0.675,0.931,-3.432,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,...,0,0,1,0,0,0,0,0,0,0
3,60,0.718,0.93,-3.778,0.102,0.0287,9e-06,0.204,0.277,121.956,...,0,0,0,0,0,0,0,0,1,0
4,69,0.65,0.833,-4.672,0.0359,0.0803,0.0,0.0833,0.725,123.976,...,0,0,1,0,0,0,0,0,0,0


In [3]:
df_num = df[['duration', 'loudness', 'energy', 'tempo', 'instrumentalness', 
             'speechiness', 'danceability', 'valence', 'acousticness', 'liveness']]

X_train, X_test, y_train, y_test = train_test_split(df_num, df.y, test_size=0.2, random_state=42)

In [6]:
#Hyperparameter tuning
def objective(trial):

    learning_rate = trial.suggest_float('learning_rate', 0.001, 1.0)
    max_iter = trial.suggest_int('max_iter', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 20, 100)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 30, 150)
    l2_regularization = trial.suggest_float('l2_regularization', 0.0, 0.1)

    model = HistGradientBoostingRegressor(
        loss='absolute_error',
        learning_rate=learning_rate,
        max_iter=max_iter,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
        l2_regularization=l2_regularization,
        random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae = make_scorer(mean_absolute_error)
    scores = cross_val_score(model, X_train, y_train, scoring=mae, cv=kf)
    return np.min([np.mean(scores), np.median([scores])])

In [7]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-04-04 18:05:47,267] A new study created in memory with name: no-name-cbd01d8f-1a94-418e-9f55-ba34c8c8a932


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-04-04 18:05:47,825] Trial 0 finished with value: 19.560544780730577 and parameters: {'learning_rate': 0.6680454682680177, 'max_iter': 1157, 'max_depth': 4, 'min_samples_leaf': 73, 'max_leaf_nodes': 76, 'l2_regularization': 0.09075185481869136}. Best is trial 0 with value: 19.560544780730577.
[I 2024-04-04 18:05:49,465] Trial 1 finished with value: 19.747235755975858 and parameters: {'learning_rate': 0.7764482864560267, 'max_iter': 1299, 'max_depth': 44, 'min_samples_leaf': 59, 'max_leaf_nodes': 110, 'l2_regularization': 0.012606861684323123}. Best is trial 0 with value: 19.560544780730577.
[I 2024-04-04 18:06:55,974] Trial 2 finished with value: 18.062090100963587 and parameters: {'learning_rate': 0.03788821791151272, 'max_iter': 1213, 'max_depth': 45, 'min_samples_leaf': 83, 'max_leaf_nodes': 137, 'l2_regularization': 0.0981893464365658}. Best is trial 2 with value: 18.062090100963587.
[I 2024-04-04 18:07:02,468] Trial 3 finished with value: 18.461707727398057 and parameters: 

In [8]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=97, state=1, values=[17.72855372592357], datetime_start=datetime.datetime(2024, 4, 4, 18, 41, 56, 672962), datetime_complete=datetime.datetime(2024, 4, 4, 18, 42, 42, 298964), params={'learning_rate': 0.05210537299041002, 'max_iter': 1253, 'max_depth': 27, 'min_samples_leaf': 31, 'max_leaf_nodes': 148, 'l2_regularization': 0.044204117745388065}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=False, low=0.001, step=None), 'max_iter': IntDistribution(high=2000, log=False, low=100, step=1), 'max_depth': IntDistribution(high=50, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=100, log=False, low=20, step=1), 'max_leaf_nodes': IntDistribution(high=150, log=False, low=30, step=1), 'l2_regularization': FloatDistribution(high=0.1, log=False, low=0.0, step=None)}, trial_id=97, value=None)
Best hyperparameters: {'learning_rate': 0.05210537299041002, 'max_iter': 1253, 'max_d

In [5]:
#Best model
hgb = HistGradientBoostingRegressor(learning_rate=0.05210537299041002, max_iter=1253, max_depth=27, min_samples_leaf=31, 
                                    max_leaf_nodes=148, l2_regularization=0.044204117745388065, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae = make_scorer(mean_absolute_error)
scores = cross_val_score(hgb, X_train, y_train, scoring=mae, cv=kf, n_jobs=-1)
np.mean(scores)

18.241716652500916

## Feature Scaling

In [6]:
def scale_and_cv(model):
    
    scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), PowerTransformer()]
    
    for i in range(4):
        scalers[i].fit(X_train)
        X_train_scaled = scalers[i].transform(X_train)
    
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mae = make_scorer(mean_absolute_error)
        scores = cross_val_score(model, X_train_scaled, y_train, scoring=mae, cv=kf, n_jobs=-1)
        print(scalers[i], np.min([np.mean(scores), np.median([scores])]))

In [7]:
scale_and_cv(hgb)

StandardScaler() 18.241682593273545
MinMaxScaler() 18.241691979938487
RobustScaler() 18.241643068804468
PowerTransformer() 18.241714131650678
