# 02_wrapped_eda.ipynb

Exploratory Data Analysis on your “Spotify Wrapped” data:
- Profile summary  
- Top tracks & listen counts  
- Temporal listening patterns  
- Saved songs overview  
- Playlist usage  


In [1]:
import plotly.io as pio
# open the chart in your browser instead of inline
pio.renderers.default = "browser"


In [2]:
# Cell 1 — Setup imports & paths
import sys
from pathlib import Path

# Ensure src/ is on the path for any shared utilities
repo_root = Path.cwd()
sys.path.append(str(repo_root / "src"))

import pandas as pd
import plotly.express as px

# Notebook display settings
pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 10)

# Convenience
interim = repo_root / "data" / "interim"


In [3]:
# ─── cell 1: bootstrap the repo so `import src…` works ──────────────────────
import sys
from pathlib import Path

# assume this notebook lives in <project_root>/notebooks/
project_root = Path.cwd().parent   # up one from notebooks/
sys.path.insert(0, str(project_root))

print("📁 project root is:", project_root)
print("✅ src on PYTHONPATH? →", (project_root / "src").exists())

# now you can import your utils
from src.preprocessing.utils import get_project_root


📁 project root is: c:\Users\choks\OneDrive\Desktop\spotify-wrapped-analysis
✅ src on PYTHONPATH? → True


In [4]:
# ─── cell 2: load all your interim CSVs ────────────────────────────────────
proj_root = get_project_root()
interim   = proj_root / "data" / "interim"

df_top       = pd.read_csv(interim / "top_tracks.csv",     parse_dates=["album_date"])
df_recent    = pd.read_csv(interim / "recently_played.csv", parse_dates=["played_at"])
df_saved     = pd.read_csv(interim / "saved_tracks.csv",    parse_dates=["saved_at"])
df_profile   = pd.read_csv(interim / "user_profile.csv")
df_playlists = pd.read_csv(interim / "user_playlists.csv")
df_pl_tracks = pd.read_csv(interim / "playlist_tracks_top3.csv", parse_dates=["added_at"])


### 1. User Profile
Basic account info fetched once.


In [5]:
# Cell 3 — Display key profile fields
profile = df_profile.iloc[0]
display(pd.DataFrame(profile).rename(columns={0: "value"}))

Unnamed: 0,value
country,US
display_name,Jeet
href,https://api.spotify.com/v1/users/31cny4wvswa3z...
id,31cny4wvswa3zmq25ccg6w2masi4
images,[]
...,...
explicit_content.filter_enabled,False
explicit_content.filter_locked,False
external_urls.spotify,https://open.spotify.com/user/31cny4wvswa3zmq2...
followers.href,


### 2. Top Tracks
Your all-time top tracks (by Spotify’s popularity) captured in the period.


In [6]:
# Cell 4 — Shape & sample
print("Top tracks count:", len(df_top))
df_top.head()


Top tracks count: 100


Unnamed: 0,track_id,track_name,album,album_date,popularity,explicit,duration_ms,artists
0,1jKXjxMWlq4BhH6f9GtZbu,TORE UP,HARDSTONE PSYCHO,2024-06-14,83,True,126986,Don Toliver
1,3vkCueOmm7xQDoJ17W1Pm3,My Love Mine All Mine,The Land Is Inhospitable and So Are We,2023-09-15,89,False,137773,Mitski
2,3xgA3KSsd8mt3UjQxNtQy3,Bajrang Baan-Lofi,Bajrang Baan-Lofi,2023-01-05,72,False,218009,Rasraj Ji Maharaj
3,6J4oLY2GEwOsUgEd50IpKy,Baarish Ka Asar,Baarish Ka Asar,2020-12-09,53,False,245500,Twin Strings
4,0Qa9pTZLUC95wJCHGYMIg4,Sajdaa,My Name Is Khan (Original Motion Picture Sound...,2010,66,False,365706,"Shankar-Ehsaan-Loy, Rahat Fateh Ali Khan, Shan..."


In [7]:
pip install nbformat

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Cell 5 — Top 10 by Spotify popularity
top10 = df_top.nlargest(10, "popularity")
px.bar(
    top10,
    x="track_name",
    y="popularity",
    text="popularity",
    title="Your Top 10 Tracks (by Spotify Popularity)",
).update_layout(xaxis_tickangle=-45)


### 3. Listening History
Analysis of your recent playback events.


In [9]:
# Cell 6 — Basic stats
print("Total plays recorded:", len(df_recent))
print("Unique tracks played:", df_recent["track_id"].nunique())
print("Unique artists:", df_recent["artists"].nunique())


Total plays recorded: 100
Unique tracks played: 56
Unique artists: 50


In [10]:
# Cell 7 — Most-played tracks in this sample
counts = df_recent["track_name"].value_counts().reset_index()
counts.columns = ["track_name","plays"]
top_played = counts.head(10)
px.bar(
    top_played,
    x="track_name",
    y="plays",
    text="plays",
    title="Top 10 Most Played Tracks (recently_played)",
).update_layout(xaxis_tickangle=-45)


#### 3.1 Time-of-Day & Day-of-Week Patterns


In [11]:
# Cell 8 — Hour of day distribution
df_recent["hour"] = df_recent["played_at"].dt.hour
px.histogram(
    df_recent,
    x="hour",
    nbins=24,
    title="Listening by Hour of Day",
    labels={"hour":"Hour (0–23)"}
)


In [12]:
# Cell — Listening by Day of Week (fixed)
import plotly.express as px

# Define the weekday order
order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

# 1) Extract day names and count, reindex to ensure correct order
counts = (
    df_recent["played_at"]
    .dt.day_name()
    .value_counts()
    .reindex(order)
)

# 2) Turn that Series into a DataFrame with unique columns
df_counts = counts.rename_axis("weekday").reset_index(name="count")

# 3) Plot
fig = px.bar(
    df_counts,
    x="weekday",
    y="count",
    title="Listening by Day of Week",
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()


#### 3.2 Monthly Trend


In [13]:
# Cell 10 — Plays per month
df_recent["month"] = df_recent["played_at"].dt.to_period("M").dt.to_timestamp()
monthly = df_recent.groupby("month").size().reset_index(name="plays")
px.line(
    monthly,
    x="month",
    y="plays",
    title="Monthly Total Plays",
    markers=True
)



Converting to PeriodArray/Index representation will drop timezone information.



### 4. Saved (“Liked”) Songs


In [14]:
# Cell 11 — Saved tracks overview
print("Total saved songs:", len(df_saved))
print("Distinct saved tracks:", df_saved["track_id"].nunique())


Total saved songs: 50
Distinct saved tracks: 50


In [15]:
# Cell 12 — Top 10 Saved Artists (fixed)
import plotly.express as px

# 1) Explode your “artists” strings into one artist per row
all_artists = df_saved["artists"].str.split(", ").explode()

# 2) Count and take the top 10
top10 = all_artists.value_counts().head(10)

# 3) Turn that Series into a clean DataFrame with unique column names
df_artists = top10.rename_axis("artist").reset_index(name="count")

# 4) Plot!
fig = px.bar(
    df_artists,
    x="artist",
    y="count",
    text="count",
    title="Top 10 Artists in Your Liked Songs"
)
fig.update_layout(xaxis_tickangle=-45, margin=dict(l=40, r=40, t=50, b=40))
fig.show()


### 5. Playlists & Playlist Contents


In [16]:
# Cell 13 — Playlists summary
print("Total playlists fetched:", len(df_playlists))
df_playlists.head()


Total playlists fetched: 2


Unnamed: 0,playlist_id,name,owner,public,tracks_total
0,5R402c0QrsyaHUwZFwEmi9,Bhagwan,Jeet,True,5
1,7oBzvnRfjDOY3ZN78ejrME,SEXXX,Jeet,False,2


In [17]:
# Cell 14 — Tracks per playlist (top3 sampled)
pl_counts = df_pl_tracks["playlist_id"].value_counts().reset_index()
pl_counts.columns = ["playlist_id","tracks_sampled"]
# Join to get playlist names
pl_counts = pl_counts.merge(df_playlists, on="playlist_id", how="left")
px.bar(
    pl_counts,
    x="name",
    y="tracks_sampled",
    title="Number of Tracks Pulled per Playlist (Top 3)",
    text="tracks_sampled"
).update_layout(xaxis_tickangle=-45)


---

✅ **EDA complete!**  
You now have insights into your profile, top tracks, listening patterns, saved songs, and playlists.  
Next up: feature engineering and modeling.
