Questions:
Which characteristics correlate with popularity
How much popularity can be explained by audio features
Given a song's audio fratures, predict a continuous popularity score from 0 - 100

1. Define target
2. Drop obvious non-features (IDs, names)
3. Handle obvious data issues (invalid rows)
4. Split data (train / val / test)
5. Decide feature set (using TRAIN ONLY)
6. Fit baseline on TRAIN
7. Train model on TRAIN
8. Evaluate on VALIDATION
9. Final evaluation on TEST

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import ast


In [None]:
# Clean the data and drop unusable columns
data = pd.read_csv("data/tracks.csv")
data.dropna(inplace=True)
data.drop(columns=['id', 'name', 'artists', 'release_date', 'id_artists'], inplace=True, axis=1)

In [3]:
# Seperate X and y (features and labels)
X = data.drop(columns=['popularity'], axis=1)
y = data['popularity']

# Split the data into 70% train, 15% val, 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [4]:
# Establish a baseline model using the mean popularity

mean_popularity = y_train.mean()
prediction_baseline = np.full(shape=y_val.shape, fill_value=mean_popularity)
r2_baseline = r2_score(y_val, prediction_baseline)
print(f"Baseline R² score: {r2_baseline}")

Baseline R² score: -1.637955591315965e-05


In [5]:
# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
print(f"Validation R² score: {r2_val}")

Validation R² score: 0.22039098020654846


In [6]:
coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': model.coef_})

# coefficients measures how much the predicted popularity changes with a unit change in the feature

print(coef_df.sort_values(by='coefficient', ascending=True))

             feature   coefficient
11           valence -1.296092e+01
8       acousticness -1.195543e+01
9   instrumentalness -9.531479e+00
7        speechiness -4.817586e+00
10          liveness -3.570030e+00
0        duration_ms  8.091281e-07
12             tempo  7.974529e-03
4                key  1.352826e-02
13    time_signature  3.602195e-02
6               mode  6.033576e-02
5           loudness  4.522698e-01
3             energy  2.419937e+00
1           explicit  1.141757e+01
2       danceability  1.597812e+01


In [9]:
# Feature Scaling
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Retrain the model on scaled data
retrained_model = LinearRegression()
retrained_model.fit(X_train_scaled, y_train)
retrained_y_pred = retrained_model.predict(X_val_scaled)
retrained_r2_val = r2_score(y_val, retrained_y_pred)
print(f"Retrained Validation R² score: {retrained_r2_val}")

retrained_coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': retrained_model.coef_})

print(retrained_coef_df.sort_values(by='coefficient', ascending=True))

Retrained Validation R² score: 0.2203909802065488
             feature  coefficient
8       acousticness    -4.172330
11           valence    -3.340417
9   instrumentalness    -2.547864
7        speechiness    -0.868185
10          liveness    -0.657660
13    time_signature     0.017024
6               mode     0.028598
4                key     0.047599
0        duration_ms     0.102153
12             tempo     0.237224
3             energy     0.609872
5           loudness     2.305768
1           explicit     2.345192
2       danceability     2.655595


In [None]:
artists_df = pd.read_csv("data/artists.csv")
tracks_df = pd.read_csv("data/tracks.csv")

tracks_df.dropna(inplace=True)
tracks_df.drop(columns=['name', 'release_date'], inplace=True, axis=1)

In [None]:
# New column: artist_popularity_median = median popularity of all tracks by the artist

# Parse id_artists
tracks_df['artists_id_list'] = tracks_df['id_artists'].apply(ast.literal_eval)
tracks_exploded = tracks_df.explode('artists_id_list')

tracks_exploded = tracks_exploded.rename(
    columns={'artists_id_list': 'artist_id'}
)
artists_df = artists_df.rename(columns={
   'id': 'artist_id',
   'popularity': 'artist_popularity'
})

tracks_enriched = tracks_exploded.merge(
   artists_df[['artist_id', 'artist_popularity']],
   on='artist_id',
   how='left'
)


In [None]:
tracks_enriched = tracks_enriched.dropna(subset=['artist_popularity'])

Unnamed: 0,id,popularity,duration_ms,explicit,artists,id_artists,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_id,artist_popularity
0,35iwgR4jXetI318WEWsa1Q,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3,45tIt06XoI0Iio4LBEVpls,4.0
1,021ht4sdgPcrDgSk7JTbKY,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1,14jtPCOoNZwquk5wd9DxrY,0.0
2,07A5yehtSnoedViJAZkNnc,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5,5LiOoJbxVSAMkBS2fUm3X2,23.0
3,08FmqUhxtyLTn6pAh6bk45,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3,5LiOoJbxVSAMkBS2fUm3X2,23.0
4,08y9GfoqCWfOGsKdwojr5e,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4,3BiJGZsyX9sJchTqcSA7Su,35.0
