## Load Libraries

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import optuna

## Load Data

In [2]:
df = pd.read_csv('spot_30k_clean.csv', index_col=0)
df.head()

Unnamed: 0,y,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,66,0.748,0.916,-2.634,0.0583,0.102,0.0,0.0653,0.518,122.036,...,0,0,0,0,0,0,0,1,0,0
1,67,0.726,0.815,-4.969,0.0373,0.0724,0.00421,0.357,0.693,99.972,...,1,0,0,0,0,0,0,0,0,0
2,70,0.675,0.931,-3.432,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,...,0,0,1,0,0,0,0,0,0,0
3,60,0.718,0.93,-3.778,0.102,0.0287,9e-06,0.204,0.277,121.956,...,0,0,0,0,0,0,0,0,1,0
4,69,0.65,0.833,-4.672,0.0359,0.0803,0.0,0.0833,0.725,123.976,...,0,0,1,0,0,0,0,0,0,0


## Train/Test Split

In [8]:
X = df.drop(columns='y')
y = df.y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
print(f'Train shape: {X_train.shape, y_train.shape}')
print(f'Test shape: {X_test.shape, y_test.shape}')

Train shape: ((26266, 53), (26266,))
Test shape: ((6567, 53), (6567,))


In [10]:
# Declare function to make predictions on the train and test sets and measure the mean absolute error of each

def measure(model):
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    print('Train, Test MAE: ', round(mean_absolute_error(y_train, train_pred), 2), round(mean_absolute_error(y_test, test_pred), 2))

In [11]:
# Set benchmark MAE by predicting the mean popularity

dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[42.54633366]])

In [12]:
measure(dumb_reg)

Train, Test MAE:  20.91 20.88


## Random Forest Regressor

In [13]:
# Untuned Random Forest Regressor

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
measure(rf)

Train, Test MAE:  6.39 17.0


In [14]:
# Define objective function
def objective(trial):
    # Suggest values for hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    # Create and fit random forest model
    model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    random_state=42,
    )
    model.fit(X_train, y_train)

    # Make predictions and calculate MAE
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Return MAE
    return mae

In [15]:
# Create study object
study = optuna.create_study(direction="minimize")

# Run optimization process
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-03-29 17:36:51,722] A new study created in memory with name: no-name-0103c288-9e82-4946-bde1-faba3402401a


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-03-29 17:37:39,875] Trial 0 finished with value: 20.008218072172376 and parameters: {'n_estimators': 901, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 6}. Best is trial 0 with value: 20.008218072172376.
[I 2024-03-29 17:38:46,414] Trial 1 finished with value: 19.807955339675974 and parameters: {'n_estimators': 941, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 1 with value: 19.807955339675974.
[I 2024-03-29 17:39:39,181] Trial 2 finished with value: 17.743435144601406 and parameters: {'n_estimators': 173, 'max_depth': 28, 'min_samples_split': 8, 'min_samples_leaf': 8}. Best is trial 2 with value: 17.743435144601406.
[I 2024-03-29 17:40:18,828] Trial 3 finished with value: 17.797720507089647 and parameters: {'n_estimators': 132, 'max_depth': 30, 'min_samples_split': 9, 'min_samples_leaf': 9}. Best is trial 2 with value: 17.743435144601406.
[I 2024-03-29 17:40:55,774] Trial 4 finished with value: 17.797465545968123 and parameters: {'

KeyboardInterrupt: 

In [None]:
# Print best trial and best hyperparameters
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

In [None]:
# Train the final model with the best hyperparameters
rf_best = RandomForestRegressor(**best_params)
rf_best.fit(X_train, y_train)

In [None]:
rf_cv_results = cross_validate(rf_best, X_train, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
rf_mae_mean = np.mean(-1 * rf_cv_results['test_score'])
rf_mae_std = np.std(-1 * rf_cv_results['test_score'])
rf_mae_mean, rf_mae_std