# 04_popularity_split_demo.ipynb

Exploratory Demo: Comparing High- vs Low-Popularity Spotify Tracks


In [None]:
import sys
from pathlib import Path

# Locate the project root so that `src` is importable: accept the current
# working directory if it contains `src/`, otherwise its parent directory.
cwd = Path.cwd()
repo_root = next(
    (candidate for candidate in (cwd, cwd.parent) if (candidate / "src").is_dir()),
    None,
)
if repo_root is None:
    raise RuntimeError("Couldn't locate project root (no `src/` folder found).")

# Put the project root first on the import path so `src.*` resolves from it.
sys.path.insert(0, str(repo_root))

import pandas as pd
import plotly.express as px

from src.preprocessing.utils import read_df_csv

# Notebook display config: show up to 20 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 20)


## 1. Load High & Low Popularity Data


In [3]:
# Paths to the two popularity-split CSV files under the project root.
splits_dir = repo_root.joinpath("data", "raw", "popularity_splits")
high_path = splits_dir.joinpath("high_popularity_spotify_data.csv")
low_path = splits_dir.joinpath("low_popularity_spotify_data.csv")

# Read each split and tag every row with the group it came from.
df_high = pd.read_csv(high_path).assign(group="high")
df_low = pd.read_csv(low_path).assign(group="low")

# Single combined frame (fresh 0..n-1 index) used by the comparison plots.
df = pd.concat([df_high, df_low], ignore_index=True)

print("High-popularity shape:", df_high.shape)
print("Low-popularity  shape:", df_low.shape)

High-popularity shape: (1686, 30)
Low-popularity  shape: (3145, 30)


## 2. Distribution of Audio Features
Compare how “energy” and “danceability” differ by popularity group.


In [None]:
# Overlaid 30-bin histograms of `energy`, one trace per popularity group.
energy_axis_labels = {"energy": "Energy", "group": "Popularity Group"}
fig_energy = px.histogram(
    df,
    x="energy",
    color="group",
    nbins=30,
    barmode="overlay",
    labels=energy_axis_labels,
    title="Energy Distribution: High vs Low Popularity",
).update_traces(opacity=0.6)  # semi-transparent so both overlaid groups stay visible
fig_energy.show()


In [None]:
# Overlaid 30-bin histograms of `danceability`, one trace per popularity group.
dance_axis_labels = {"danceability": "Danceability", "group": "Popularity Group"}
fig_dance = px.histogram(
    df,
    x="danceability",
    color="group",
    nbins=30,
    barmode="overlay",
    labels=dance_axis_labels,
    title="Danceability Distribution: High vs Low Popularity",
).update_traces(opacity=0.6)  # semi-transparent so both overlaid groups stay visible
fig_dance.show()


### 2.1 Summary Statistics


In [None]:
# Per-group mean of each audio feature, rounded to three decimals.
audio_features = ["energy", "danceability", "valence", "tempo"]
group_means = df.groupby("group")[audio_features].mean()
summary = group_means.round(3)
summary


Unnamed: 0_level_0,energy,danceability,valence,tempo
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,0.667,0.65,0.526,121.071
low,0.544,0.607,0.458,116.767


## 3. Playlist Genre Comparison
Which playlist genres dominate high vs low popularity?


In [None]:
import pandas as pd
import plotly.express as px

def top_genres(df_grp: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """Return the `n` most frequent playlist genres in `df_grp`.

    Parameters
    ----------
    df_grp : pd.DataFrame
        Frame containing a ``playlist_genre`` column.
    n : int, default 10
        Maximum number of genres to return.

    Returns
    -------
    pd.DataFrame
        Two columns, ``playlist_genre`` and ``count``, ordered from the most
        to the least frequent genre, with a default integer index.
    """
    genre_counts = df_grp["playlist_genre"].value_counts()
    leaders = genre_counts.nlargest(n)
    columns = {"playlist_genre": leaders.index, "count": leaders.values}
    return pd.DataFrame(columns)

# Top-10 genre counts for each popularity group, tagged with the group name.
high_top = top_genres(df_high, 10).assign(group="high")
low_top = top_genres(df_low, 10).assign(group="low")

# Stack both tables into one long frame for a grouped bar chart.
df_genres = pd.concat([high_top, low_top], ignore_index=True)

genre_axis_labels = {
    "playlist_genre": "Playlist Genre",
    "count": "Track Count",
    "group": "Popularity Group",
}
fig_genres = px.bar(
    df_genres,
    x="playlist_genre",
    y="count",
    color="group",
    barmode="group",
    labels=genre_axis_labels,
    title="Top 10 Playlist Genres: High vs Low Popularity",
)
# Tilt the genre tick labels and leave a gap between bar groups.
fig_genres.update_layout(xaxis_tickangle=-45, bargap=0.2)
fig_genres.show()


## 4. Boxplots of Popularity Scores by Tempo Bucket
Explore whether higher-tempo tracks tend to be more popular.


In [None]:
# Bucket tracks by tempo (BPM) and compare popularity across buckets, split by
# popularity group. Intervals are right-closed: [0, 90], (90, 120], (120, inf),
# so the first label is "≤90" rather than "<90" (a tempo of exactly 90 lands there).
bins = [0, 90, 120, float("inf")]
labels = ["≤90", "90–120", ">120"]
# include_lowest=True closes the left edge of the first bin; with the default
# (False) any row with tempo == 0 falls outside every bin and becomes NaN.
df["tempo_bucket"] = pd.cut(
    df["tempo"], bins=bins, labels=labels, include_lowest=True
)

fig_box = px.box(
    df,
    x="tempo_bucket",
    y="track_popularity",
    color="group",
    # Plotly Express orders categories by first appearance in the data, not by
    # the pandas categorical order, so pin the x-axis to ascending tempo.
    category_orders={"tempo_bucket": labels},
    title="Popularity by Tempo Bucket",
    labels={"tempo_bucket":"Tempo Bucket (BPM)","track_popularity":"Popularity"}
)
fig_box.show()


## 5. Conclusions

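- **Energy & Danceability**: high-popularity tracks skew higher on average (mean energy 0.667 vs 0.544; mean danceability 0.650 vs 0.607), and also have higher mean valence and tempo.  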
- **Playlist Genres**: certain genres may correlate with popularity.  
- **Tempo Effects**: boxplots reveal whether faster or slower tracks tend to rate better.  
