# import

In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

import mlflow

THRESHOLD=50

# load

In [21]:
spotify = pd.read_csv('data/spotify_data.csv')

spotify['is_popular'] = spotify['popularity'] >= THRESHOLD
spotify = spotify[spotify['year'] != 2023]

print(spotify.columns)
spotify.describe()
# spotify.head(10)
# print(spotify)

Index(['Unnamed: 0', 'artist_name', 'track_name', 'track_id', 'popularity',
       'year', 'genre', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature', 'is_popular'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,popularity,year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0,1121115.0
mean,660982.7,18.33009,2011.574,0.5373502,0.6404769,5.28801,-8.958909,0.6350294,0.09296653,0.3207728,0.2512347,0.2232447,0.4566433,121.3677,250313.5,3.885708
std,435741.4,15.80787,6.598368,0.184413,0.2698937,3.554551,5.646161,0.4814221,0.1275915,0.3545309,0.3644375,0.2015088,0.2685501,29.77975,150163.7,0.4678041
min,0.0,0.0,2000.0,0.0,0.0,0.0,-58.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2073.0,0.0
25%,280278.5,5.0,2006.0,0.413,0.455,2.0,-10.814,0.0,0.0371,0.00634,1.03e-06,0.0978,0.227,98.791,181867.0,4.0
50%,560557.0,15.0,2012.0,0.55,0.694,5.0,-7.441,1.0,0.0507,0.146,0.00171,0.134,0.439,121.879,226387.0,4.0
75%,1049246.0,29.0,2017.0,0.677,0.873,8.0,-5.272,1.0,0.089,0.638,0.608,0.292,0.675,139.894,287640.0,4.0
max,1473395.0,94.0,2022.0,0.993,1.0,11.0,6.172,1.0,0.971,0.996,1.0,1.0,1.0,249.993,6000495.0,5.0


# train model

In [27]:
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment("Spotify popular song prediction")
mlflow.autolog()

numeric = spotify['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']
categorical = spotify['artist_name', 'track_name', 'year', 'genre', 'key', 'time_signature']

X = numeric + categorical
y = spotify['is_popular'].copy()

model = RandomForestRegressor()

with mlflow.start_run():
    pipeline = makePipeline(model)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)

2025/05/12 16:22:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/05/12 16:22:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


KeyError: ('danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms')

# pipeline

In [26]:
def makePipeline(model):

    num_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ])

    cat_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('scale', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    col_transformer = ColumnTransformer(transformers=[
        ('num_pipe', num_pipe, numeric),
        ('cat_pipe', cat_pipe, categorical),
    ],
        remainder='drop',
        n_jobs=-1,
    )

    return Pipeline([
        ('col_transformer', col_transformer),
        ('model', model),
    ])

# evaluate model

In [None]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))