Questions:
Which characteristics correlate with popularity?
How much of the variation in popularity can be explained by audio features?
Given a song's audio features, predict a continuous popularity score from 0 to 100.

1. Define target
2. Drop obvious non-features (IDs, names)
3. Handle obvious data issues (invalid rows)
4. Split data (train / val / test)
5. Decide feature set (using TRAIN ONLY)
6. Fit baseline on TRAIN
7. Train model on TRAIN
8. Evaluate on VALIDATION
9. Final evaluation on TEST

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler


In [5]:
# Load the tracks table, remove incomplete rows, then remove the identifier /
# free-text columns that are not used as model features.
# Written as a single chain without inplace mutation; `data` is the cleaned frame.
# To inspect missing values per column, evaluate
# `pd.read_csv("data/tracks.csv").isna().sum()` as the last line of its own cell
# (a bare expression in the middle of a cell is computed and then discarded).
# NOTE(review): dropna() runs while the identifier/text columns are still present,
# so a row missing a value in any of them is removed as well — confirm this is intended.
data = (
    pd.read_csv("data/tracks.csv")
    .dropna()
    .drop(columns=['id', 'name', 'artists', 'release_date', 'id_artists'])
)

In [None]:
# Target is the popularity column; every remaining column is a feature.
y = data['popularity']
X = data.drop(columns='popularity')

# Two-stage split: hold out 30% first, then halve that hold-out,
# giving 70% train / 15% validation / 15% test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=42,
)

In [7]:
# Baseline: predict the training-set mean popularity for every validation row,
# then score that constant prediction against the validation targets.

mean_popularity = y_train.mean()
prediction_baseline = np.full(len(y_val), mean_popularity)
r2_baseline = r2_score(y_true=y_val, y_pred=prediction_baseline)
print(f"Baseline R² score: {r2_baseline}")

Baseline R² score: -1.637955591315965e-05


In [8]:
# Fit a linear regression on the training split (fit() returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out validation split.
y_val_pred = model.predict(X_val)
r2_val = r2_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation R² score: {r2_val}")

Validation R² score: 0.22039098020654846


In [None]:
# Pair each feature name with its fitted coefficient. A coefficient is the change in
# predicted popularity for a one-unit change in that feature. X_train is in raw units
# here, so magnitudes are not directly comparable across features.
coef_df = pd.DataFrame({'feature': X_train.columns, 'coefficient': model.coef_})

# Most negative coefficients first.
print(coef_df.sort_values('coefficient'))

             feature   coefficient
11           valence -1.296092e+01
8       acousticness -1.195543e+01
9   instrumentalness -9.531479e+00
7        speechiness -4.817586e+00
10          liveness -3.570030e+00
0        duration_ms  8.091281e-07
12             tempo  7.974529e-03
4                key  1.352826e-02
13    time_signature  3.602195e-02
6               mode  6.033576e-02
5           loudness  4.522698e-01
3             energy  2.419937e+00
1           explicit  1.141757e+01
2       danceability  1.597812e+01


In [None]:
# Feature scaling: fit the scaler on the training split only, then apply that same
# fitted transform to the training and validation features.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)