## Random Forest Hyperparameter Tuning

In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
import optuna

In [2]:
df = pd.read_csv('spot_30k_clean.csv', index_col=0)
df.head()

Unnamed: 0,y,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,66,0.748,0.916,-2.634,0.0583,0.102,0.0,0.0653,0.518,122.036,...,0,0,0,0,0,0,0,1,0,0
1,67,0.726,0.815,-4.969,0.0373,0.0724,0.00421,0.357,0.693,99.972,...,1,0,0,0,0,0,0,0,0,0
2,70,0.675,0.931,-3.432,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,...,0,0,1,0,0,0,0,0,0,0
3,60,0.718,0.93,-3.778,0.102,0.0287,9e-06,0.204,0.277,121.956,...,0,0,0,0,0,0,0,0,1,0
4,69,0.65,0.833,-4.672,0.0359,0.0803,0.0,0.0833,0.725,123.976,...,0,0,1,0,0,0,0,0,0,0


In [3]:
df_num = df[['duration', 'loudness', 'energy', 'tempo', 'instrumentalness', 
             'speechiness', 'danceability', 'valence', 'acousticness', 'liveness']]

X_train, X_test, y_train, y_test = train_test_split(df_num, df.y, test_size=0.2, random_state=42)

In [10]:
#More robust hyperparameter tuning, without max features
def objective(trial):

    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
        n_jobs=-1)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae = make_scorer(mean_absolute_error)
    scores = cross_val_score(model, X_train, y_train, scoring=mae, cv=kf, n_jobs=-1)
    return np.min([np.mean(scores), np.median([scores])])

In [11]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-04-01 12:52:39,690] A new study created in memory with name: no-name-0900b897-eff5-4fa5-9022-412d69c9d21c


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-04-01 13:04:42,369] Trial 0 finished with value: 18.898404286862498 and parameters: {'n_estimators': 1081, 'max_depth': 24, 'min_samples_split': 9, 'min_samples_leaf': 21}. Best is trial 0 with value: 18.898404286862498.
[I 2024-04-01 13:18:51,977] Trial 1 finished with value: 18.57098047332963 and parameters: {'n_estimators': 1134, 'max_depth': 16, 'min_samples_split': 8, 'min_samples_leaf': 11}. Best is trial 1 with value: 18.57098047332963.
[I 2024-04-01 13:21:27,672] Trial 2 finished with value: 20.13755199882161 and parameters: {'n_estimators': 855, 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 22}. Best is trial 1 with value: 18.57098047332963.
[I 2024-04-01 13:25:16,905] Trial 3 finished with value: 19.692120500433198 and parameters: {'n_estimators': 619, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 10}. Best is trial 1 with value: 18.57098047332963.
[I 2024-04-01 13:47:42,458] Trial 4 finished with value: 18.359667175984843 and parameters: {

In [12]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=90, state=1, values=[16.83306590984513], datetime_start=datetime.datetime(2024, 4, 3, 11, 41, 48, 15366), datetime_complete=datetime.datetime(2024, 4, 3, 12, 10, 55, 229637), params={'n_estimators': 1554, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 1}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=2000, log=False, low=100, step=1), 'max_depth': IntDistribution(high=50, log=False, low=2, step=1), 'min_samples_split': IntDistribution(high=32, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=32, log=False, low=1, step=1)}, trial_id=90, value=None)
Best hyperparameters: {'n_estimators': 1554, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 1}


In [19]:
#Best model
rf = RandomForestRegressor(n_estimators=1554, max_depth=39, random_state=42, n_jobs=-1)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mae = make_scorer(mean_absolute_error)
scores = cross_val_score(rf, X_train, y_train, scoring=mae, cv=kf, n_jobs=-1)
np.mean(scores)

16.860438380260636

## Feature Scaling

In [16]:
def scale_and_cv(model):
    
    scalers = [StandardScaler(), MinMaxScaler(), RobustScaler(), PowerTransformer()]
    
    for i in range(4):
        scalers[i].fit(X_train)
        X_train_scaled = scalers[i].transform(X_train)
    
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        mae = make_scorer(mean_absolute_error)
        scores = cross_val_score(model, X_train_scaled, y_train, scoring=mae, cv=kf)
        print(scalers[i], np.min([np.mean(scores), np.median([scores])]))

In [17]:
scale_and_cv(rf)

StandardScaler() 16.833498762269983
MinMaxScaler() 16.838821309702674
RobustScaler() 16.837613407626783
PowerTransformer() 16.838369810674028
