In [1]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

Download the supplementary data from Kaggle for artist info

In [2]:
# Load the per-artist stats, preferring the local CSV cache over a fresh download.
artist_cache = Path("data/spotify_artist_data.csv")

if artist_cache.exists():
    artist_stats = pd.read_csv(artist_cache)
else:
    path = kagglehub.dataset_download("adnananam/spotify-artist-stats")
    artist_stats = pd.read_csv(Path(path) / "spotify_artist_data.csv", index_col=0)

    # Remove error rows b/c the creator didn't process correctly: these are rows
    # whose "Lead Streams" value is the literal column name. Resetting the index
    # gives a fresh frame with the same default index the cached branch produces.
    artist_stats = artist_stats[artist_stats["Lead Streams"] != "Lead Streams"].reset_index(drop=True)

    # Cast numeric columns to int (the raw values use "," as a thousands separator)
    for col in ["Lead Streams", "Feats", "Tracks", "One Billion", "100 Million"]:
        artist_stats[col] = artist_stats[col].str.replace(",", "", regex=False).astype(int)

    # Remove the last updated column, it's not useful/relevant
    artist_stats = artist_stats.drop(columns=["Last Updated"])

    # The cache directory may not exist on a fresh checkout; to_csv does not create it.
    artist_cache.parent.mkdir(parents=True, exist_ok=True)
    artist_stats.to_csv(artist_cache, index=False)

artist_stats.head()

Unnamed: 0,Artist Name,Lead Streams,Feats,Tracks,One Billion,100 Million
0,Drake,50162292808,19246513666,262,6,130
1,Bad Bunny,44369032140,5391990975,163,5,118
2,Ed Sheeran,38153682361,2791278201,240,10,62
3,The Weeknd,34767779741,4288903657,186,8,72
4,Taylor Swift,32596728109,424053296,323,1,96


In [3]:
# Fetch the artist-popularity dataset via kagglehub and read its artists table.
path = kagglehub.dataset_download("pieca111/music-artists-popularity")
artists_csv = path + "/artists.csv"
art_pop = pd.read_csv(artists_csv, low_memory=False)

In [5]:
# Disabled exploratory check (kept for reference): percentage of artists in the
# track data that have no exact match in `art_pop["artist_mb"]`.
# NOTE(review): this reads `df_nodupe`, which is only created in a later cell,
# so it cannot be re-enabled at this position without reordering the notebook.
# all_artists = set(df_nodupe["artists"].str.cat(sep=";").split(";"))
# not_pres = []
# art_pop_artists = set(art_pop["artist_mb"].values)

# for artist in tqdm(all_artists):
#     if artist not in art_pop_artists:
#         not_pres.append(artist)

# len(not_pres) / len(all_artists) * 100  # Percentage of artists not present in the dataset

In [6]:
# Display the artist-popularity frame loaded above.
art_pop

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist
0,cc197bad-dc9c-440d-a5b5-d52ba2e14234,Coldplay,Coldplay,United Kingdom,United Kingdom,rock; pop; alternative rock; british; uk; brit...,rock; alternative; britpop; alternative rock; ...,5381567.0,360111850.0,False
1,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,Radiohead,United Kingdom,United Kingdom,rock; electronic; alternative rock; british; g...,alternative; alternative rock; rock; indie; el...,4732528.0,499548797.0,False
2,8bfac288-ccc5-448d-9573-c33ea2aa5c30,Red Hot Chili Peppers,Red Hot Chili Peppers,United States,United States,rock; alternative rock; 80s; 90s; rap; metal; ...,rock; alternative rock; alternative; Funk Rock...,4620835.0,293784041.0,False
3,73e5e69d-3554-40d8-8516-00cb38737a1c,Rihanna,Rihanna,United States,Barbados; United States,pop; dance; hip hop; reggae; contemporary r b;...,pop; rnb; female vocalists; dance; Hip-Hop; Ri...,4558193.0,199248986.0,False
4,b95ce3ff-3d05-4e87-9e01-c97b66af13d4,Eminem,Eminem,United States,United States,turkish; rap; american; hip-hop; hip hop; hiph...,rap; Hip-Hop; Eminem; hip hop; pop; american; ...,4517997.0,199507511.0,False
...,...,...,...,...,...,...,...,...,...,...
1466078,1eab523e-98ff-4083-aa34-8922740bc696,정은지,,South Korea,South Korea,,,,,False
1466079,a18f0527-907e-42b0-8268-504966274581,남태현,,South Korea,,,,,,False
1466080,20a57e37-24b5-4301-855b-35076580fb88,헤일로,,South Korea,,,,,,False
1466081,83891a4d-1bf4-4abe-a483-5b3d9d614efa,서현진,,South Korea,South Korea,,,,,False


Download the main tracks dataset from Hugging Face using pandas and drop the extra index column. Rows with a missing (`NaN`) value in the `artists` column are dropped, because that column is used to join the supplementary artist data above to the main dataset.

In [7]:
# Pulled dataset from HF, dropped unneeded index column.
# The cleaned result is cached locally so later runs skip the download.
tracks_cache = Path("data/spotify_tracks.csv")

if tracks_cache.exists():
    df = pd.read_csv(tracks_cache)
else:
    df = (
        pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")
        .drop("Unnamed: 0", axis=1)
        .dropna(subset=["artists"])  # `artists` is split per row later on, so it must be present
    )

    df["duration_s"] = df["duration_ms"] / 1000
    df = df.drop(columns=["duration_ms"])  # Drop original duration column, keep seconds

    # The cache directory may not exist on a fresh checkout; to_csv does not create it.
    tracks_cache.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(tracks_cache, index=False)

# One row per track_id; `df` itself keeps every row (it is grouped by track_id later
# to build the genre columns).
df_nodupe = df.drop_duplicates(subset=["track_id"]).copy()

df.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_s
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic,230.666
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic,149.61
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic,210.826
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic,201.933
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic,198.853


Add artist-level statistics to the main dataset. When a track lists two or more artists, the stats of the artists found in the supplementary data are averaged; tracks with no matching artist get 0 for every stat.

In [8]:
# Build the modelling table (or load it from the cache): one row per track with
# averaged artist stats and one count column per genre.
if not Path("data/spotify_tracks_processed.csv").exists():
    # Output column in the processed frame -> source column in `artist_stats`.
    # NOTE(review): "featured_tracks" is filled from the "Tracks" column — confirm
    # the name is intended; it is kept unchanged so the cached CSV schema stays stable.
    stat_columns = {
        "lead_streams": "Lead Streams",
        "featured_streams": "Feats",
        "featured_tracks": "Tracks",
        "one_billion": "One Billion",
        "hundred_million": "100 Million",
    }

    # One dict lookup per artist instead of re-filtering `artist_stats` for every
    # stat of every artist. keep="first" matches the earlier `.values[0]` choice
    # when an artist name appears more than once.
    stats_by_artist = (
        artist_stats.drop_duplicates(subset="Artist Name", keep="first")
        .set_index("Artist Name")[list(stat_columns.values())]
        .to_dict(orient="index")
    )

    averaged_stats = {col: [] for col in stat_columns}
    for artists_field in tqdm(df_nodupe["artists"], total=df_nodupe.shape[0], desc="Processing rows"):
        # Stats of every listed artist that is present in `artist_stats`.
        matched = [
            stats_by_artist[name]
            for name in (part.strip() for part in artists_field.split(";"))
            if name in stats_by_artist
        ]

        for col, source_col in stat_columns.items():
            # Average over the matched artists; tracks with no match get 0.
            averaged_stats[col].append(
                float(np.mean([stats[source_col] for stats in matched])) if matched else 0.0
            )

    # Assign whole columns at once (positional, one value per `df_nodupe` row)
    # instead of scanning the frame with a boolean mask for every row and column.
    for col, values in averaged_stats.items():
        df_nodupe[col] = values

    # Per-track genre counts, computed over the full `df` so that a track listed
    # under several genres gets a count in each of them.
    g_dummy = pd.get_dummies(df["track_genre"]).groupby(df["track_id"]).sum().astype(int).reset_index()

    dummy_val = g_dummy.copy()
    dummy_val["total"] = dummy_val.sum(axis=1, numeric_only=True)
    dummy_val = dummy_val[["track_id", "total"]].sort_values("track_id", ascending=True)

    process_check = (
        df.groupby("track_id")
        .size()
        .to_frame("total")
        .reset_index()
        .sort_values("track_id", ascending=True)
    )

    # Sanity check: for every track, the genre counts must add up to the number
    # of rows that track has in `df`.
    assert len(process_check) == len(dummy_val)
    assert (process_check["track_id"].to_numpy() == dummy_val["track_id"].to_numpy()).all()
    assert (process_check["total"].to_numpy() == dummy_val["total"].to_numpy()).all()

    # Attach the genre counts and drop identifier/text columns that are not model inputs.
    df = df_nodupe.merge(g_dummy, on="track_id").drop(
        ["track_id", "artists", "album_name", "track_name", "track_genre"], axis=1
    )
    df["explicit"] = df["explicit"].astype(int)
    df.to_csv("data/spotify_tracks_processed.csv", index=False)

else:
    df = pd.read_csv("data/spotify_tracks_processed.csv")

In [9]:
# Summary statistics of the processed frame.
df.describe()

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,spanish,study,swedish,synth-pop,tango,techno,trance,trip-hop,turkish,world-music
count,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,...,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0,89740.0
mean,33.198808,0.085848,0.562166,0.634458,5.28353,-8.498994,0.636973,0.087442,0.328285,0.173415,...,0.011143,0.011143,0.011143,0.011143,0.011143,0.011143,0.011143,0.011143,0.011143,0.011143
std,20.58064,0.280141,0.176692,0.256606,3.559912,5.221518,0.480875,0.113278,0.338321,0.323849,...,0.104973,0.105185,0.104973,0.104973,0.104973,0.104973,0.105079,0.105291,0.105079,0.105079
min,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.45,0.457,2.0,-10.32225,0.0,0.036,0.0171,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,33.0,0.0,0.576,0.676,5.0,-7.185,1.0,0.0489,0.188,5.8e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,49.0,0.0,0.692,0.853,8.0,-5.108,1.0,0.0859,0.625,0.097625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,100.0,1.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,...,1.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0


In [10]:
# Correlation of every column with the target, strongest positive first.
df.corr().loc[:, "popularity"].sort_values(ascending=False)

popularity          1.000000
pop-film            0.134407
k-pop               0.122339
hundred_million     0.106079
chill               0.105386
                      ...   
detroit-techno     -0.113376
latin              -0.127165
instrumentalness   -0.127477
romance            -0.141027
iranian            -0.157936
Name: popularity, Length: 134, dtype: float64

In [11]:
# Impute lead_streams for rows where it is 0 by predicting it from the other
# features (the target `popularity` is excluded from the inputs).

# Create mask for rows where lead_streams is 0
mask = df['lead_streams'] == 0

# Guard: fitting or predicting on an empty frame raises, so only run the
# imputation when there are both rows to learn from and rows to fill. This keeps
# the cell re-runnable once the zeros have already been replaced.
if mask.any() and (~mask).any():
    # Split data into features (X) and target (y)
    X_train = df[~mask].drop(['lead_streams', 'popularity'], axis=1)
    y_train = df[~mask]['lead_streams']

    # Prepare features for prediction
    X_pred = df[mask].drop(['lead_streams', 'popularity'], axis=1)

    # Initialize and train the RandomForestRegressor
    rf_model = RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
        max_features='sqrt',
        verbose=1
    )
    rf_model.fit(X_train, y_train)

    # Make predictions for empty values
    predictions = rf_model.predict(X_pred)

    # Fill in the empty values
    df.loc[mask, 'lead_streams'] = predictions

# Verify no more zeros in lead_streams
print(f"Number of zeros in lead_streams: {(df['lead_streams'] == 0).sum()}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s


Number of zeros in lead_streams: 0


[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.2s finished


In [12]:
# Cluster the tracks on the standardized features and store the label as a
# categorical `cluster` column.
# "cluster" is excluded alongside the target so that re-running this cell does
# not feed the previous run's labels back in as a clustering input (on the first
# run the column does not exist yet and the exclusion has no effect).
cluster_features = df.columns.difference(["popularity", "cluster"])
std_df = StandardScaler().fit_transform(df[cluster_features])
kmeans = KMeans(n_clusters=40, random_state=42)
kmeans.fit(std_df)
df["cluster"] = kmeans.labels_
df["cluster"] = df["cluster"].astype("category")

df

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,...,study,swedish,synth-pop,tango,techno,trance,trip-hop,turkish,world-music,cluster
0,73,0,0.676,0.4610,1,-6.746,0,0.1430,0.0322,0.000001,...,0,0,0,0,0,0,0,0,0,18
1,55,0,0.420,0.1660,1,-17.235,1,0.0763,0.9240,0.000006,...,0,0,0,0,0,0,0,0,0,7
2,57,0,0.438,0.3590,0,-9.734,1,0.0557,0.2100,0.000000,...,0,0,0,0,0,0,0,0,0,33
3,71,0,0.266,0.0596,0,-18.515,1,0.0363,0.9050,0.000071,...,0,0,0,0,0,0,0,0,0,33
4,82,0,0.618,0.4430,2,-9.681,1,0.0526,0.4690,0.000000,...,0,0,0,0,0,0,0,0,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89735,21,0,0.172,0.2350,5,-16.393,1,0.0422,0.6400,0.928000,...,0,0,0,0,0,0,0,0,1,33
89736,22,0,0.174,0.1170,0,-18.318,0,0.0401,0.9940,0.976000,...,0,0,0,0,0,0,0,0,1,33
89737,22,0,0.629,0.3290,0,-10.895,0,0.0420,0.8670,0.000000,...,0,0,0,0,0,0,0,0,1,39
89738,41,0,0.587,0.5060,7,-10.889,1,0.0297,0.3810,0.000000,...,0,0,0,0,0,0,0,0,1,39


In [13]:
# Pairwise correlation matrix of the frame
dfc = df.corr()

# True wherever the absolute correlation exceeds 0.50
mask = dfc.abs() > 0.50

# Strict upper triangle only: each unordered pair appears once, diagonal excluded
mask_upper = np.triu(mask, k=1)

# Collect the flagged pairs; np.nonzero walks the matrix row by row, so pairs
# come out ordered by first column, then second column
high_corr = [
    {"var1": dfc.columns[i], "var2": dfc.columns[j], "corr": dfc.iloc[i, j]}
    for i, j in zip(*np.nonzero(mask_upper))
]

# Tabulate, strongest absolute correlation first
high_corr_df = pd.DataFrame(high_corr).sort_values("corr", key=abs, ascending=False)

print("Correlations > |0.50|:")
print(high_corr_df.to_string(index=False))

Correlations > |0.50|:
             var1             var2      corr
singer-songwriter       songwriter  1.000000
     lead_streams  hundred_million  0.952045
     lead_streams      one_billion  0.822557
           reggae        reggaeton  0.801791
           energy         loudness  0.758774
           latino        reggaeton  0.736928
           energy     acousticness -0.732569
              dub          dubstep  0.723472
      one_billion  hundred_million  0.706854
             punk        punk-rock  0.624188
      speechiness           comedy  0.623655
              edm            house  0.619816
           latino           reggae  0.614418
     lead_streams featured_streams  0.612210
 featured_streams  hundred_million  0.593427
            latin           latino  0.590402
         alt-rock      alternative  0.588235
  featured_tracks        classical  0.583836
         loudness     acousticness -0.582664
            indie        indie-pop  0.573530
 featured_streams      one_billi

In [14]:
# Target, and every other column as a feature
y = df["popularity"]
X = df[df.columns.difference(["popularity"])]

# Hold out 30% of the rows for testing (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Build the random forest and fit it on the training split (fit returns the estimator)
model = RandomForestRegressor(
    bootstrap=True,
    max_features="sqrt",
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X_train, y_train)

# Predict popularity for the held-out rows
y_pred = model.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    5.7s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.4s finished


In [15]:
# Score the held-out predictions: R², MSE, RMSE and MAE
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the four metrics, one per line
print(f"R²: {r2}", f"MSE: {mse}", f"RMSE: {rmse}", f"MAE: {mae}", sep="\n")

R²: 0.559583518284249
MSE: 185.3493406756414
RMSE: 13.614306470608094
MAE: 9.1960334774148


In [16]:
# How many of the highest-ranked features to keep
top_n = 50

# Pair each training column with the importance the fitted forest assigned to it
feature_importances = pd.DataFrame(
    {
        "feature": X_train.columns,
        "importance": model.feature_importances_,
    }
)

# Rank by importance, highest first, and keep the first top_n rows
ranked_importances = feature_importances.sort_values(by="importance", ascending=False)
top_features = ranked_importances.iloc[:top_n]

# Show the ranking without the positional index
print(f"Top {top_n} most important features:")
print(top_features.to_string(index=False))

Top 50 most important features:
         feature  importance
    lead_streams    0.059791
      duration_s    0.047070
    acousticness    0.046878
    danceability    0.045851
        loudness    0.044578
          energy    0.043816
         valence    0.043696
         cluster    0.043553
     speechiness    0.042731
           tempo    0.040899
        liveness    0.039540
instrumentalness    0.037467
             key    0.025275
 featured_tracks    0.023615
 hundred_million    0.022847
         iranian    0.018677
featured_streams    0.018422
         romance    0.016027
           latin    0.015987
            rock    0.011267
         hip-hop    0.010885
   chicago-house    0.009878
            kids    0.009662
           chill    0.009485
        pop-film    0.009320
  detroit-techno    0.009132
           k-pop    0.008283
            soul    0.008120
             pop    0.007875
            mode    0.007714
     one_billion    0.007297
           dance    0.007222
         co

In [17]:
# Target, and only the top-ranked features selected above
y = df["popularity"]
X = df[top_features["feature"].to_list()]

# Hold out 30% of the rows for testing (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Build the random forest and fit it on the training split (fit returns the estimator)
model = RandomForestRegressor(
    bootstrap=True,
    max_features="sqrt",
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X_train, y_train)

# Predict popularity for the held-out rows
y_pred = model.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    4.8s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 200 out of 200 | elapsed:    0.3s finished


In [18]:
# Score the held-out predictions: R², MSE, RMSE and MAE
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the four metrics, one per line
print(f"R²: {r2}", f"MSE: {mse}", f"RMSE: {rmse}", f"MAE: {mae}", sep="\n")

R²: 0.5274444276377195
MSE: 198.87508166980706
RMSE: 14.10230767178929
MAE: 9.811953400818483


The stacked model can probably be skipped: its performance (recorded below) is only slightly better than that of the random forest regressors (RFRs).

```md
Stacked Model R^2: 0.537192088407299
Stacked Model MSE: 194.77277721078033
Stacked Model RMSE: 13.956101791359231
Stacked Model MAE: 9.468840211962455
```

In [19]:
# Disabled experiment (kept for reference): a stacking ensemble of random forest,
# gradient boosting and linear regression with a linear-regression meta-model,
# trained on the selected top features.
# NOTE(review): the split variables are named `*_scaled`, but no scaler is applied
# in this cell — confirm whether scaling was intended before re-enabling.
# # Splitting the data
# X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
#     df[top_features["feature"].to_list()], df["popularity"], test_size=0.3, random_state=42
# )

# # Base models
# base_models = [
#     ("rf", RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)),
#     ("gb", GradientBoostingRegressor(n_estimators=200, random_state=42)),
#     ("lr", LinearRegression()),
# ]

# # Meta-model (Level 2 model)
# meta_model = LinearRegression()

# # Stacking model
# stacked_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)
# stacked_model.fit(X_train_scaled, y_train)

# # Predictions and MSE
# y_pred_stacked = stacked_model.predict(X_test_scaled)
# mse_stacked = mean_squared_error(y_test, y_pred_stacked)
# print(f"Stacked Model MSE: {mse_stacked}")


In [None]:
# Disabled: metrics for the stacked model. Requires `y_pred_stacked`, which is
# only produced by the (also disabled) stacking cell.
# r2 = r2_score(y_test, y_pred_stacked)
# mse = mean_squared_error(y_test, y_pred_stacked)
# rmse = root_mean_squared_error(y_test, y_pred_stacked)
# mae = mean_absolute_error(y_test, y_pred_stacked)

# print(f"Stacked Model R^2: {r2}")
# print(f"Stacked Model MSE: {mse}")
# print(f"Stacked Model RMSE: {rmse}")
# print(f"Stacked Model MAE: {mae}")
