## Gradient Boosting Hyperparamter Tuning

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
import optuna

In [3]:
df = pd.read_csv('spot_30k_clean.csv', index_col=0)
df.head()

Unnamed: 0,y,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,66,0.748,0.916,-2.634,0.0583,0.102,0.0,0.0653,0.518,122.036,...,0,0,0,0,0,0,0,1,0,0
1,67,0.726,0.815,-4.969,0.0373,0.0724,0.00421,0.357,0.693,99.972,...,1,0,0,0,0,0,0,0,0,0
2,70,0.675,0.931,-3.432,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,...,0,0,1,0,0,0,0,0,0,0
3,60,0.718,0.93,-3.778,0.102,0.0287,9e-06,0.204,0.277,121.956,...,0,0,0,0,0,0,0,0,1,0
4,69,0.65,0.833,-4.672,0.0359,0.0803,0.0,0.0833,0.725,123.976,...,0,0,1,0,0,0,0,0,0,0


In [4]:
df_num = df[['duration', 'loudness', 'energy', 'tempo', 'instrumentalness', 
             'speechiness', 'danceability', 'valence', 'acousticness', 'liveness']]

X_train, X_test, y_train, y_test = train_test_split(df_num, df.y, test_size=0.2, random_state=42)

In [18]:
#Hyperparameter tuning
def objective(trial):

    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1.0)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)

    model = GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae = make_scorer(mean_absolute_error)
    scores = cross_val_score(model, X_train, y_train, scoring=mae, cv=kf)
    return np.min([np.mean(scores), np.median([scores])])

In [20]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, show_progress_bar=True)

[I 2024-04-04 13:44:37,666] A new study created in memory with name: no-name-0b290cfc-2904-43ae-a8c0-eeb91243d8c3


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2024-04-04 13:58:19,978] Trial 0 finished with value: 16.796608136395932 and parameters: {'n_estimators': 657, 'learning_rate': 0.17213162542313915, 'max_depth': 16, 'min_samples_split': 6, 'min_samples_leaf': 19}. Best is trial 0 with value: 16.796608136395932.
[I 2024-04-04 14:30:36,212] Trial 1 finished with value: 17.042907894983937 and parameters: {'n_estimators': 1121, 'learning_rate': 0.24021517386398597, 'max_depth': 20, 'min_samples_split': 12, 'min_samples_leaf': 24}. Best is trial 0 with value: 16.796608136395932.
[I 2024-04-04 14:44:29,601] Trial 2 finished with value: 17.23265621156765 and parameters: {'n_estimators': 919, 'learning_rate': 0.06099191490009361, 'max_depth': 11, 'min_samples_split': 15, 'min_samples_leaf': 21}. Best is trial 0 with value: 16.796608136395932.
[I 2024-04-04 14:49:48,202] Trial 3 finished with value: 17.890237835941793 and parameters: {'n_estimators': 202, 'learning_rate': 0.021024176048139834, 'max_depth': 28, 'min_samples_split': 12, 'min_

In [21]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=0, state=1, values=[16.796608136395932], datetime_start=datetime.datetime(2024, 4, 4, 13, 44, 37, 680087), datetime_complete=datetime.datetime(2024, 4, 4, 13, 58, 19, 978687), params={'n_estimators': 657, 'learning_rate': 0.17213162542313915, 'max_depth': 16, 'min_samples_split': 6, 'min_samples_leaf': 19}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=2000, log=False, low=100, step=1), 'learning_rate': FloatDistribution(high=1.0, log=False, low=0.001, step=None), 'max_depth': IntDistribution(high=50, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=32, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=32, log=False, low=1, step=1)}, trial_id=0, value=None)
Best hyperparameters: {'n_estimators': 657, 'learning_rate': 0.17213162542313915, 'max_depth': 16, 'min_samples_split': 6, 'min_samples_leaf': 19}


In [6]:
#Best model
gb = GradientBoostingRegressor(n_estimators=657, learning_rate=0.17213162542313915, max_depth=16, 
                               min_samples_split=6, min_samples_leaf=19, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae = make_scorer(mean_absolute_error)
scores = cross_val_score(gb, X_train, y_train, scoring=mae, cv=kf, n_jobs=-1)
np.mean(scores)

16.81402824291245

## Feature Scaling

In [7]:
def scale_and_cv(model):
    
    scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), PowerTransformer()]
    
    for i in range(4):
        scalers[i].fit(X_train)
        X_train_scaled = scalers[i].transform(X_train)
    
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mae = make_scorer(mean_absolute_error)
        scores = cross_val_score(model, X_train_scaled, y_train, scoring=mae, cv=kf, n_jobs=-1)
        print(scalers[i], np.min([np.mean(scores), np.median([scores])]))

In [8]:
scale_and_cv(gb)

StandardScaler() 16.833807639393186
MinMaxScaler() 16.781766008132358
RobustScaler() 16.80563668461532
PowerTransformer() 16.802237254572574
