In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, explained_variance_score

import utils

## Predicting Spotify Popularity

Considering that the Billboard Hot 100 position prediction was only partially successful, I would also like to explore whether the Spotify popularity can be accurately predicted from just the audio features and the chart performance. I think that this metric will be easier to predict, as it is not as complex in nature as the Billboard Hot 100 position: more streams mean a higher Spotify popularity, while the chart position takes many different popularity factors into account.

I will again read the data, evaluate different models, optimize the best-performing model, and come to a final conclusion about how effective the result is.

In [4]:
# Read the preprocessed feature table and the original song/artist table.
# Rows with missing values are removed from the original table and its index is renumbered.
data = pd.read_csv("data/preprocessed_data.csv", index_col=0)
songs = (
    pd.read_csv("data/filled_artists_info.csv", index_col=0)
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Remove songs from 2000 onwards whose artist has fewer than 10,000 followers,
# then renumber the remaining rows.
low_follower_modern = (songs["artist_followers"] < 10000) & (songs["year"] >= 2000)
songs = songs[~low_follower_modern].reset_index(drop=True)

In [6]:
# Preview the preprocessed frame (the rendered output elides the middle rows and columns)
data

Unnamed: 0,peak_position,weeks_on_chart,year,track_duration_s,danceability,energy,loudness,speechiness,acousticness,instrumentalness,...,valence,tempo,spotify_popularity,artist_popularity,artist_followers,total_songs_for_artist,good_performing_artist,top_10_hit,cluster_hit_pct,mode
0,1.000000,0.573891,1.0,0.370547,0.66750,0.700312,0.675954,0.200735,0.063552,0.000000,...,0.609485,0.240374,0.91,0.80,0.608628,0.000000,0.0,1,0.149368,1.0
1,1.000000,0.508251,1.0,0.396942,0.56250,0.852237,0.680192,0.194118,0.007598,0.000000,...,0.737639,0.567850,0.93,0.91,0.921535,1.000000,1.0,1,0.149368,1.0
2,1.000000,0.526271,1.0,0.684702,0.88750,0.453694,0.577714,0.570588,0.010741,0.000000,...,0.215943,0.389389,0.95,0.91,0.898979,0.879310,1.0,1,0.104945,1.0
3,0.979798,0.573891,1.0,0.386583,0.64125,0.753382,0.650968,0.209559,0.107428,0.158409,...,0.696266,0.409726,0.98,0.94,0.836027,0.068966,0.0,1,0.149368,1.0
4,0.989899,0.543128,1.0,0.304559,0.83000,0.687825,0.648972,0.322794,0.097689,0.376784,...,0.927346,0.629522,0.94,0.78,0.604582,0.017241,0.0,1,0.149368,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28359,0.202020,0.000000,0.0,0.331544,0.43250,0.798127,0.472206,0.546324,0.743975,0.152988,...,0.977800,0.801196,0.75,0.70,0.681877,0.310345,1.0,0,0.168868,1.0
28360,0.191919,0.000000,0.0,0.176417,0.77000,0.661811,0.296394,0.358824,0.780120,0.087601,...,0.955600,0.447692,0.19,0.70,0.681877,0.310345,1.0,0,0.168868,1.0
28361,0.141414,0.000000,0.0,0.300344,0.06750,0.416233,0.551939,0.269853,0.660642,0.000000,...,0.348133,0.143637,0.44,0.56,0.565504,0.741379,1.0,0,0.180021,1.0
28362,0.040404,0.000000,0.0,0.227917,0.49000,0.468262,0.414970,0.255147,0.764056,0.589273,...,0.824420,0.260481,0.21,0.49,0.319055,0.000000,0.0,0,0.172721,0.0


In [7]:
# Columns left out of the feature matrix: the target itself, the top-10 flag,
# and the artist popularity / follower columns.
excluded_columns = ["top_10_hit", "artist_popularity", "artist_followers", "spotify_popularity"]

# Features: every remaining column, with a fresh 0..n-1 index
X = data.drop(columns=excluded_columns).reset_index(drop=True)

# Target: spotify_popularity as stored in the preprocessed frame
# (the preview above shows these values already on a 0-1 scale, not the raw scores)
y = data["spotify_popularity"]

## Evaluating Different Models

I will use different regression models with their default parameters to evaluate which one is best at predicting the Spotify popularity.
The custom `evaluate_regression_model` function prints several regression metrics, which will help identify the best-performing model.

In [9]:
# Hold out 25% of the rows for testing; rows are shuffled with a fixed seed
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [10]:
# Baseline: linear regression with scikit-learn's default settings
utils.evaluate_regression_model(
    LinearRegression(),
    X_train,
    X_test,
    y_train,
    y_test,
)

Regression Evaluation Metrics:
R-squared: 0.61
Mean Absolute Error (MAE): 0.11
Mean Squared Error (MSE): 0.02
Root Mean Squared Error (RMSE): 0.15


In [11]:
# Random Forest Regressor (default hyperparameters)
# The forest is randomized, so it is seeded to make the reported metrics reproducible
# on re-run; the seed matches the one used for the train/test split and the tuned model.
# NOTE: the stored metrics below may shift slightly until this cell is re-executed.
utils.evaluate_regression_model(
    RandomForestRegressor(random_state=42),
    X_train,
    X_test,
    y_train,
    y_test,
)

Regression Evaluation Metrics:
R-squared: 0.69
Mean Absolute Error (MAE): 0.10
Mean Squared Error (MSE): 0.02
Root Mean Squared Error (RMSE): 0.13


In [12]:
# Gradient Boosting Regressor (default hyperparameters)
# Seeded so the comparison with the other models is reproducible on re-run;
# the seed matches the one used for the train/test split.
# NOTE: the stored metrics below may shift slightly until this cell is re-executed.
utils.evaluate_regression_model(
    GradientBoostingRegressor(random_state=42),
    X_train,
    X_test,
    y_train,
    y_test,
)

Regression Evaluation Metrics:
R-squared: 0.68
Mean Absolute Error (MAE): 0.10
Mean Squared Error (MSE): 0.02
Root Mean Squared Error (RMSE): 0.13


In [13]:
# AdaBoost Regressor (default hyperparameters)
# Seeded so the comparison with the other models is reproducible on re-run;
# the seed matches the one used for the train/test split.
# NOTE: the stored metrics below may shift slightly until this cell is re-executed.
utils.evaluate_regression_model(
    AdaBoostRegressor(random_state=42),
    X_train,
    X_test,
    y_train,
    y_test,
)

Regression Evaluation Metrics:
R-squared: 0.58
Mean Absolute Error (MAE): 0.12
Mean Squared Error (MSE): 0.02
Root Mean Squared Error (RMSE): 0.15


In [14]:
# Support vector regression with scikit-learn's default settings
utils.evaluate_regression_model(
    SVR(),
    X_train, X_test,
    y_train, y_test,
)

Regression Evaluation Metrics:
R-squared: 0.67
Mean Absolute Error (MAE): 0.10
Mean Squared Error (MSE): 0.02
Root Mean Squared Error (RMSE): 0.13


It looks like the Random Forest Regressor narrowly outperforms the other models, so that is the one I am going to use.

## Hyperparameter Tuning

Using `GridSearchCV`, I will try to find the parameters that lead to the best results.

In [18]:
# NOTE: this whole cell is commented out (presumably to avoid re-running the 48 fits);
# the output stored below it comes from an earlier execution.
# NOTE(review): the search is fitted on the full X, y, which includes the rows held out
# as X_test by the earlier train/test split - consider fitting on X_train / y_train only.
# NOTE(review): an integer cv gives unshuffled folds by default; the frame appears to be
# ordered by year, which may explain why the CV MSE (~0.047) is much worse than the
# held-out MSE reported later (~0.017) - TODO confirm.

# # Define parameter grid for GridSearchCV
# param_grid = {
#     'n_estimators': [50, 100, 150],           # Number of trees in the forest
#     'max_depth': [None],               # Depth of trees (None = no explicit limit)
#     'criterion': ['squared_error', 'absolute_error'],  # Splitting criterion
#     'min_samples_split': [2, 5],                # Min samples to split a node
#     'min_samples_leaf': [1, 2],                      # Min samples at a leaf node
# }

# # Set up GridSearchCV
# grid_search = GridSearchCV(
#     estimator=RandomForestRegressor(random_state=42),
#     param_grid=param_grid,
#     scoring='neg_mean_squared_error',  # Optimize for MSE
#     cv=2,                 # 2-fold cross-validation
#     n_jobs=-1,            # Use all available processors
#     verbose=2,
# )

# # Perform the grid search
# grid_search.fit(X, y)

# # Results
# print("Best Parameters:", grid_search.best_params_)
# print("Best Negative MSE Score:", grid_search.best_score_)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
Best Parameters: {'criterion': 'squared_error', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Best Negative MSE Score: -0.047227793682677655


## Best model evaluation

In [26]:
# Hyperparameters reported as best in the grid-search output above
tuned_params = {
    "n_estimators": 150,
    "criterion": "squared_error",
    "max_depth": None,
    "min_samples_split": 5,
    "min_samples_leaf": 1,
    "random_state": 42,
}
best_model = RandomForestRegressor(**tuned_params)

# Fit on the training split only; the test split is kept for the evaluation cell
best_model.fit(X_train, y_train)

In [33]:
# Predict on the held-out split and print each metric with its label
y_pred = best_model.predict(X_test)

metric_report = (
    ("Mean Squared Error (MSE):", mean_squared_error),
    ("Mean Absolute Error (MAE):", mean_absolute_error),
    ("R-squared (R2) Score:", r2_score),
    ("Median Absolute Error:", median_absolute_error),
    ("Explained Variance Score:", explained_variance_score),
)
for label, metric in metric_report:
    print(label, metric(y_test, y_pred))

Mean Squared Error (MSE): 0.016614327609961687
Mean Absolute Error (MAE): 0.09873210229953283
R-squared (R2) Score: 0.6885935838638653
Median Absolute Error: 0.07855824675324674
Explained Variance Score: 0.6889472965379417


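The model performs reasonably well, with an R² score of 0.69, indicating that it explains about 69% of the variance in the target variable. The MAE (about 0.10) and MSE (about 0.017) suggest that, on average, predictions are fairly close to the actual values on the 0–1 scaled target. The median absolute error (about 0.08) is lower than the MAE, which shows that typical errors are smaller and that a minority of songs with larger errors pull the mean up.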

The model is overall decent, but there is still room for improvement.

I think the conclusion to draw is that, even if it is a little better, the model is still not powerful enough to make very accurate predictions.