## XGBoost Hyperparameter Tuning

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
import optuna

In [4]:
df = pd.read_csv('spot_30k_clean.csv', index_col=0)
df.head()

Unnamed: 0,y,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,66,0.748,0.916,-2.634,0.0583,0.102,0.0,0.0653,0.518,122.036,...,0,0,0,0,0,0,0,1,0,0
1,67,0.726,0.815,-4.969,0.0373,0.0724,0.00421,0.357,0.693,99.972,...,1,0,0,0,0,0,0,0,0,0
2,70,0.675,0.931,-3.432,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,...,0,0,1,0,0,0,0,0,0,0
3,60,0.718,0.93,-3.778,0.102,0.0287,9e-06,0.204,0.277,121.956,...,0,0,0,0,0,0,0,0,1,0
4,69,0.65,0.833,-4.672,0.0359,0.0803,0.0,0.0833,0.725,123.976,...,0,0,1,0,0,0,0,0,0,0


In [5]:
df_num = df[['duration', 'loudness', 'energy', 'tempo', 'instrumentalness', 
             'speechiness', 'danceability', 'valence', 'acousticness', 'liveness']]

X_train, X_test, y_train, y_test = train_test_split(df_num, df.y, test_size=0.2, random_state=42)

In [6]:
#Hyperparameter tuning
def objective(trial):

    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    gamma = trial.suggest_float('gamma', 0.0, 1.0)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
        
    model = XGBRegressor(
        learning_rate=learning_rate,
        gamma=gamma,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae = make_scorer(mean_absolute_error)
    scores = cross_val_score(model, X_train, y_train, scoring=mae, cv=kf, n_jobs=-1)
    return np.min([np.mean(scores), np.median([scores])])

In [7]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-04-10 13:16:28,411] A new study created in memory with name: no-name-762094d5-65df-4ad7-b7d1-f9e3cb211a5d


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-04-10 13:16:34,753] Trial 0 finished with value: 19.230340295014976 and parameters: {'learning_rate': 0.03666464625006048, 'gamma': 0.7815332099819975, 'max_depth': 8, 'min_child_weight': 16, 'subsample': 0.624904815074568, 'colsample_bytree': 0.9095875816842729}. Best is trial 0 with value: 19.230340295014976.
[I 2024-04-10 13:16:37,871] Trial 1 finished with value: 19.072047251554846 and parameters: {'learning_rate': 0.25032765352754743, 'gamma': 0.5441177820876565, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.8728114410231136, 'colsample_bytree': 0.8835348704732734}. Best is trial 1 with value: 19.072047251554846.
[I 2024-04-10 13:16:43,164] Trial 2 finished with value: 18.65130685154379 and parameters: {'learning_rate': 0.23470987610376073, 'gamma': 0.7686426082267421, 'max_depth': 10, 'min_child_weight': 15, 'subsample': 0.5463245339257685, 'colsample_bytree': 0.7451480976692597}. Best is trial 2 with value: 18.65130685154379.
[I 2024-04-10 13:16:48,394] Trial 3 f

In [8]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=35, state=1, values=[17.530376249328242], datetime_start=datetime.datetime(2024, 4, 10, 13, 19, 10, 867224), datetime_complete=datetime.datetime(2024, 4, 10, 13, 19, 16, 625125), params={'learning_rate': 0.22104387821944305, 'gamma': 0.4918595690664665, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.9280489884605818, 'colsample_bytree': 0.8521107556733699}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'gamma': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'max_depth': IntDistribution(high=10, log=False, low=2, step=1), 'min_child_weight': IntDistribution(high=20, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None)}, trial_id=35, value=None)
Best hyperparameters: {'learning_rate': 0.22104387821944305, 'gamma

In [9]:
#Best model
xgb = XGBRegressor(learning_rate=0.22104387821944305, gamma=0.4918595690664665, max_depth=10, min_child_weight=2, 
                   subsample=0.9280489884605818, colsample_bytree=0.8521107556733699, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae = make_scorer(mean_absolute_error)
scores = cross_val_score(xgb, X_train, y_train, scoring=mae, cv=kf, n_jobs=-1)
np.mean(scores)

17.573077127868885

## Feature Scaling

In [10]:
def scale_and_cv(model):
    
    scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), PowerTransformer()]
    
    for i in range(4):
        scalers[i].fit(X_train)
        X_train_scaled = scalers[i].transform(X_train)
    
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mae = make_scorer(mean_absolute_error)
        scores = cross_val_score(model, X_train_scaled, y_train, scoring=mae, cv=kf, n_jobs=-1)
        print(scalers[i], np.min([np.mean(scores), np.median([scores])]))

In [11]:
scale_and_cv(xgb)

StandardScaler() 17.65979178907016
MinMaxScaler() 17.607779816836942
RobustScaler() 17.604016469792594
PowerTransformer() 17.551199712485428
