Questions:
Which characteristics correlate with popularity?
How much of a song's popularity can be explained by its audio features?
Given a song's audio features, can we predict a continuous popularity score from 0 to 100?

1. Define target
2. Drop obvious non-features (IDs, names)
3. Handle obvious data issues (invalid rows)
4. Split data (train / val / test)
5. Decide feature set (using TRAIN ONLY)
6. Fit baseline on TRAIN
7. Train model on TRAIN
8. Evaluate on VALIDATION
9. Final evaluation on TEST

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import ast


In [5]:
# Load the raw track table, discard incomplete rows, then remove identifier /
# free-text columns that are not used as model features.
# NOTE(review): dropna() runs before the column drop, so a row is removed when
# ANY column is missing, including the columns discarded right after —
# confirm that this is intended.
data = (
    pd.read_csv("data/tracks.csv")
    .dropna()
    .drop(columns=['id', 'name', 'artists', 'release_date', 'id_artists'])
)

In [6]:
# Separate the label (popularity) from the feature columns
y = data['popularity']
X = data.drop(columns=['popularity'])

# Hold out 30% of the rows, then halve the held-out part:
# 70% train, 15% validation, 15% test. Fixed seeds keep the split reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [7]:
# Baseline: predict the TRAIN mean popularity for every validation row.
# Any real model has to beat this R² (which is ~0 by construction).
mean_popularity = y_train.mean()
prediction_baseline = np.repeat(mean_popularity, len(y_val))

r2_baseline = r2_score(y_val, prediction_baseline)
print(f"Baseline R² score: {r2_baseline}")

Baseline R² score: -1.637955591315965e-05


In [8]:
# Fit ordinary least squares on the raw (unscaled) training features
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the validation split
y_val_pred = model.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
print(f"Validation R² score: {r2_val}")

Validation R² score: 0.22039098020654846


In [9]:
# Each coefficient is the change in predicted popularity for a one-unit change
# in that feature. The model above was fit on unstandardized features, so the
# magnitudes are not directly comparable across features.
coef_df = pd.DataFrame(
    {'feature': X_train.columns, 'coefficient': model.coef_}
)

print(coef_df.sort_values('coefficient'))

             feature   coefficient
11           valence -1.296092e+01
8       acousticness -1.195543e+01
9   instrumentalness -9.531479e+00
7        speechiness -4.817586e+00
10          liveness -3.570030e+00
0        duration_ms  8.091281e-07
12             tempo  7.974529e-03
4                key  1.352826e-02
13    time_signature  3.602195e-02
6               mode  6.033576e-02
5           loudness  4.522698e-01
3             energy  2.419937e+00
1           explicit  1.141757e+01
2       danceability  1.597812e+01


In [10]:
# Standardize the features. The scaler learns its statistics from TRAIN only
# and is then applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Refit the same linear model on the standardized features
retrained_model = LinearRegression().fit(X_train_scaled, y_train)
retrained_y_pred = retrained_model.predict(X_val_scaled)
retrained_r2_val = r2_score(y_val, retrained_y_pred)
print(f"Retrained Validation R² score: {retrained_r2_val}")

# Coefficients are now per standard deviation of each feature
retrained_coef_df = pd.DataFrame(
    {'feature': X_train.columns, 'coefficient': retrained_model.coef_}
)

print(retrained_coef_df.sort_values('coefficient'))

Retrained Validation R² score: 0.2203909802065488
             feature  coefficient
8       acousticness    -4.172330
11           valence    -3.340417
9   instrumentalness    -2.547864
7        speechiness    -0.868185
10          liveness    -0.657660
13    time_signature     0.017024
6               mode     0.028598
4                key     0.047599
0        duration_ms     0.102153
12             tempo     0.237224
3             energy     0.609872
5           loudness     2.305768
1           explicit     2.345192
2       danceability     2.655595


In [11]:
# Reload the raw tables for the artist-enriched feature set. The id / artist
# columns are kept this time because they are needed for the join.
artists_df = pd.read_csv("data/artists.csv")
tracks_df = (
    pd.read_csv("data/tracks.csv")
    .dropna()
    .drop(columns=['name', 'release_date'])
)

In [12]:
# Goal: artist_popularity_median = for each track, the median of the
# artist-level popularity scores (from artists.csv) of the artists listed in
# its id_artists field. This cell builds the one-row-per-(track, artist) table.

# id_artists is stored as the string form of a Python list; parse it safely
tracks_df['artists_id_list'] = tracks_df['id_artists'].apply(ast.literal_eval)

# One row per (track, artist) pair
tracks_exploded = (
    tracks_df
    .explode('artists_id_list')
    .rename(columns={'artists_id_list': 'artist_id'})
)

# Rename so the artist columns do not collide with the track id / popularity
artists_df = artists_df.rename(
    columns={'id': 'artist_id', 'popularity': 'artist_popularity'}
)

# Left join keeps every (track, artist) row; unmatched artists get NaN
tracks_enriched = pd.merge(
    tracks_exploded,
    artists_df[['artist_id', 'artist_popularity']],
    on='artist_id',
    how='left',
)


In [13]:
# Keep only (track, artist) rows whose artist had a popularity score
tracks_enriched = tracks_enriched.loc[tracks_enriched['artist_popularity'].notna()]

# Collapse back to one row per track id: median artist popularity of its artists
artist_popularity_by_track = (
    tracks_enriched
    .groupby('id')['artist_popularity']
    .median()
    .rename('artist_popularity_median')
    .reset_index()
)

In [None]:
# Attach the per-track median to the track table and keep only tracks for
# which a median could be computed.
tracks_final = (
    tracks_df
    .merge(artist_popularity_by_track, on='id', how='left')
    .dropna(subset=['artist_popularity_median'])
)

In [23]:
# Target, and features with the target plus identifier / list columns removed
y_final = tracks_final['popularity']
X_final = tracks_final.drop(
    columns=['popularity', 'id', 'artists', 'id_artists', 'artists_id_list']
)

# Same 70 / 15 / 15 split recipe and seeds as the first experiment
X_final_train, X_final_temp, y_final_train, y_final_temp = train_test_split(
    X_final, y_final, test_size=0.3, random_state=42
)
X_final_val, X_final_test, y_final_val, y_final_test = train_test_split(
    X_final_temp, y_final_temp, test_size=0.5, random_state=42
)

In [24]:
# Mean-popularity baseline recomputed on the enriched dataset's own splits
final_mean_popularity = y_final_train.mean()
final_prediction_baseline = np.repeat(final_mean_popularity, len(y_final_val))

final_r2_baseline = r2_score(y_final_val, final_prediction_baseline)
print(f"Final Baseline R² score: {final_r2_baseline}")

Final Baseline R² score: -1.213564594149119e-06


In [None]:
# Linear regression on the audio features plus artist_popularity_median,
# fit on TRAIN and scored on VALIDATION
final_model = LinearRegression().fit(X_final_train, y_final_train)

y_final_yal_pred = final_model.predict(X_final_val)
final_val_r2 = r2_score(y_final_val, y_final_yal_pred)
print(f"Final Validation R² score: {final_val_r2}")

Final Validation R² score: 0.451297228875325


In [33]:
# Final evaluation of the already-fitted model on the held-out TEST split
y_final_test_pred = final_model.predict(X_final_test)

# RMSE is in popularity points (same units as the target)
mse = mean_squared_error(y_final_test, y_final_test_pred)
rmse = np.sqrt(mse)
final_test_r2 = r2_score(y_final_test, y_final_test_pred)

print(f"Final RMSE: {rmse}")
print(f"Final R² score: {final_test_r2}")

Final RMSE: 13.538408421947688
Final R² score: 0.45214955643898436
