# 01_api_ingestion.ipynb

Pulling all Spotify user data endpoints into tidy CSVs for downstream work.

In [None]:
import json
import logging
import os
import subprocess
import sys
from pathlib import Path
# Resolve project paths relative to the kernel's working directory.
# Assumes the kernel was started in <repo>/notebooks — TODO confirm for other launch setups.
notebooks_dir = Path.cwd()
repo_root     = notebooks_dir.parent
src_dir       = repo_root / "src"
# Make the project packages under src/ importable. The membership guard keeps
# the cell idempotent: re-running it no longer prepends duplicate sys.path entries.
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))
import pandas as pd

from data_ingestion.spotify_client import SpotifyClient
from preprocessing.utils      import read_json_dir, read_json_file

# Request INFO-level logging with a message-only format, then grab the root logger
# that the later cells use for their "Saved ..." messages.
# NOTE(review): the recorded outputs show a "timestamp LEVEL name ▶ message" layout
# rather than the bare message requested here — presumably the root logger already had
# a handler when this ran, in which case basicConfig() changes nothing. Confirm.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger()

In [None]:
# Instantiate the project's Spotify client; the recorded output shows this emits an
# "Authenticated to Spotify" log line listing the granted scopes.
# NOTE(review): `client` is not referenced by any later cell visible here — presumably
# this serves as an authentication smoke test; confirm before removing.
client = SpotifyClient()

2025-05-05 21:27:07,512 INFO data_ingestion.spotify_client ▶ Authenticated to Spotify with scope=user-top-read user-read-recently-played user-library-read playlist-read-private


In [None]:
# Fetch fresh top-tracks and recently-played snapshots through the ingestion CLI.
#
# The kernel's working directory is notebooks/, so the bare relative path
# "src/data_ingestion/fetch_data.py" does not exist from here (the recorded output
# shows "No such file or directory"). The script path is therefore built from
# src_dir, and the child process runs with cwd=repo_root so it behaves as if
# launched from the repository root, which is what the relative path implied.
# sys.executable is used so the script runs under the same interpreter as the kernel.
fetch_script = src_dir / "data_ingestion" / "fetch_data.py"
for cli_args in (
    ["--type", "top_tracks", "--time_range", "medium_term", "--limit", "50"],
    ["--type", "recently_played", "--limit", "50"],
):
    completed = subprocess.run(
        [sys.executable, str(fetch_script), *cli_args],
        cwd=repo_root,
        capture_output=True,   # child output would otherwise bypass the notebook
        text=True,
        errors="replace",      # never fail on undecodable bytes in the script's log output
    )
    print(completed.stdout, completed.stderr, sep="", end="")
    # Fail loudly: a silent fetch failure would leave later cells reading stale snapshots.
    completed.check_returncode()

C:\Users\choks\AppData\Local\Programs\Python\Python311\python.exe: can't open file 'c:\\Users\\choks\\OneDrive\\Desktop\\spotify-wrapped-analysis\\notebooks\\src\\data_ingestion\\fetch_data.py': [Errno 2] No such file or directory
C:\Users\choks\AppData\Local\Programs\Python\Python311\python.exe: can't open file 'c:\\Users\\choks\\OneDrive\\Desktop\\spotify-wrapped-analysis\\notebooks\\src\\data_ingestion\\fetch_data.py': [Errno 2] No such file or directory


In [None]:
# Fetch fresh saved-tracks, user-profile and user-playlists snapshots through the ingestion CLI.
#
# The kernel's working directory is notebooks/, so the bare relative path
# "src/data_ingestion/fetch_data.py" does not exist from here (the recorded output
# shows "No such file or directory"). The script path is therefore built from
# src_dir, and the child process runs with cwd=repo_root so it behaves as if
# launched from the repository root, which is what the relative path implied.
# sys.executable is used so the script runs under the same interpreter as the kernel.
fetch_script = src_dir / "data_ingestion" / "fetch_data.py"
for cli_args in (
    ["--type", "saved_tracks", "--limit", "50"],
    ["--type", "user_profile"],
    ["--type", "user_playlists", "--limit", "20"],
):
    completed = subprocess.run(
        [sys.executable, str(fetch_script), *cli_args],
        cwd=repo_root,
        capture_output=True,   # child output would otherwise bypass the notebook
        text=True,
        errors="replace",      # never fail on undecodable bytes in the script's log output
    )
    print(completed.stdout, completed.stderr, sep="", end="")
    # Fail loudly: a silent fetch failure would leave later cells reading stale snapshots.
    completed.check_returncode()

C:\Users\choks\AppData\Local\Programs\Python\Python311\python.exe: can't open file 'c:\\Users\\choks\\OneDrive\\Desktop\\spotify-wrapped-analysis\\notebooks\\src\\data_ingestion\\fetch_data.py': [Errno 2] No such file or directory
C:\Users\choks\AppData\Local\Programs\Python\Python311\python.exe: can't open file 'c:\\Users\\choks\\OneDrive\\Desktop\\spotify-wrapped-analysis\\notebooks\\src\\data_ingestion\\fetch_data.py': [Errno 2] No such file or directory
C:\Users\choks\AppData\Local\Programs\Python\Python311\python.exe: can't open file 'c:\\Users\\choks\\OneDrive\\Desktop\\spotify-wrapped-analysis\\notebooks\\src\\data_ingestion\\fetch_data.py': [Errno 2] No such file or directory


In [None]:
# Directory holding the raw JSON snapshots; later cells read from raw_dir.
raw_dir = repo_root.joinpath("data", "raw", "spotify_api")

# Show which snapshots are on disk, in sorted path order, one file name per line.
json_paths = sorted(raw_dir.glob("*.json"))
for json_path in json_paths:
    print(json_path.name)

audio_features_50_20250429_191806.json
audio_features_50_20250429_194059.json
audio_features_50_20250429_194156.json
recently_played_50_20250429_153854.json
recently_played_50_20250505_204855.json
saved_tracks_50_0_20250429_153902.json
saved_tracks_50_0_20250505_204856.json
top_tracks_medium_term_50_0_20250429_153843.json
top_tracks_medium_term_50_0_20250505_204854.json
user_playlists_20_0_20250429_153912.json
user_playlists_20_0_20250505_204857.json
user_profile_20250429_153512.json
user_profile_20250505_204857.json


In [None]:
# Flatten every top-tracks snapshot in raw_dir into one row per track and save it as CSV.
top_columns = [
    "track_id", "track_name", "album", "album_date",
    "popularity", "explicit", "duration_ms", "artists",
]
top_blobs = read_json_dir(raw_dir, pattern="top_tracks_*.json")
records = [
    {
        "track_id": item["id"],
        "track_name": item["name"],
        "album": item["album"]["name"],
        "album_date": item["album"]["release_date"],
        "popularity": item["popularity"],
        "explicit": item["explicit"],
        "duration_ms": item["duration_ms"],
        # Multiple artists are collapsed into one comma-separated string.
        "artists": ", ".join(a["name"] for a in item["artists"]),
    }
    for blob in top_blobs
    for item in blob.get("items", [])
]
# Explicit columns keep the schema (and the CSV header) stable even when no snapshot matched.
df_top = pd.DataFrame(records, columns=top_columns)

# raw_dir accumulates one snapshot per fetch, so the same track shows up once per
# snapshot (100 rows from a limit-50 endpoint). Keep one row per track.
# NOTE(review): keep="last" assumes read_json_dir yields snapshots oldest-first, so the
# retained popularity is the most recent one — confirm against read_json_dir.
df_top = df_top.drop_duplicates(subset="track_id", keep="last").reset_index(drop=True)

# to_csv does not create missing directories; make sure data/interim exists first.
interim_dir = repo_root / "data" / "interim"
interim_dir.mkdir(parents=True, exist_ok=True)
df_top.to_csv(interim_dir / "top_tracks.csv", index=False)
logger.info("Saved top_tracks.csv (%d rows)", len(df_top))
df_top.head()

2025-05-05 21:27:44,123 INFO root ▶ Saved top_tracks.csv (100 rows)


Unnamed: 0,track_id,track_name,album,album_date,popularity,explicit,duration_ms,artists
0,1jKXjxMWlq4BhH6f9GtZbu,TORE UP,HARDSTONE PSYCHO,2024-06-14,83,True,126986,Don Toliver
1,3vkCueOmm7xQDoJ17W1Pm3,My Love Mine All Mine,The Land Is Inhospitable and So Are We,2023-09-15,89,False,137773,Mitski
2,3xgA3KSsd8mt3UjQxNtQy3,Bajrang Baan-Lofi,Bajrang Baan-Lofi,2023-01-05,72,False,218009,Rasraj Ji Maharaj
3,6J4oLY2GEwOsUgEd50IpKy,Baarish Ka Asar,Baarish Ka Asar,2020-12-09,53,False,245500,Twin Strings
4,0Qa9pTZLUC95wJCHGYMIg4,Sajdaa,My Name Is Khan (Original Motion Picture Sound...,2010,66,False,365706,"Shankar-Ehsaan-Loy, Rahat Fateh Ali Khan, Shan..."


In [None]:
# Flatten every recently-played snapshot in raw_dir into one row per play event and save it as CSV.
recent_columns = ["played_at", "track_id", "track_name", "album", "album_date", "artists"]
recent_blobs = read_json_dir(raw_dir, pattern="recently_played_*.json")
records = []
for blob in recent_blobs:
    for item in blob.get("items", []):
        track = item["track"]
        records.append({
            "played_at": item["played_at"],
            "track_id": track["id"],
            "track_name": track["name"],
            "album": track["album"]["name"],
            "album_date": track["album"]["release_date"],
            # Multiple artists are collapsed into one comma-separated string.
            "artists": ", ".join(a["name"] for a in track["artists"]),
        })
# Explicit columns keep the schema (and the CSV header) stable even when no snapshot matched.
df_recent = pd.DataFrame(records, columns=recent_columns)

# Snapshots taken close together overlap in time, so the same play event
# (same track at the same played_at timestamp) can appear in more than one file.
# Count each play once.
df_recent = (
    df_recent
    .drop_duplicates(subset=["played_at", "track_id"])
    .reset_index(drop=True)
)

# to_csv does not create missing directories; make sure data/interim exists first.
interim_dir = repo_root / "data" / "interim"
interim_dir.mkdir(parents=True, exist_ok=True)
df_recent.to_csv(interim_dir / "recently_played.csv", index=False)
logger.info("Saved recently_played.csv (%d rows)", len(df_recent))
df_recent.head()

2025-05-05 21:30:39,930 INFO root ▶ Saved recently_played.csv (100 rows)


Unnamed: 0,played_at,track_id,track_name,album,album_date,artists
0,2025-04-28T17:20:59.760Z,4RoKNqyZ9622tcAeNPNv5k,City Boys,I Told Them...,2023-08-24,Burna Boy
1,2025-04-27T18:48:01.766Z,2p8IUWQDrpjuFltbdgLOag,After Hours,After Hours,2020-03-20,The Weeknd
2,2025-04-27T18:43:46.393Z,2cYqizR4lgvp4Qu6IQ3qGN,BUTTERFLY EFFECT,ASTROWORLD,2018-08-03,Travis Scott
3,2025-04-27T18:40:35.254Z,6LyAwkJsHlW7RQ8S1cYAtM,Overdue (with Travis Scott),NOT ALL HEROES WEAR CAPES (Deluxe),2018-11-06,"Metro Boomin, Travis Scott"
4,2025-04-27T18:37:48.684Z,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,ASTROWORLD,2018-08-03,Travis Scott


In [None]:
# Flatten every saved-tracks snapshot in raw_dir into one row per saved track and save it as CSV.
saved_columns = ["saved_at", "track_id", "track_name", "album", "album_date", "artists"]
saved_blobs = read_json_dir(raw_dir, pattern="saved_tracks_*.json")
records = []
for blob in saved_blobs:
    for item in blob.get("items", []):
        track = item["track"]
        records.append({
            # The API field is "added_at"; it is exposed as "saved_at" in the CSV.
            "saved_at": item["added_at"],
            "track_id": track["id"],
            "track_name": track["name"],
            "album": track["album"]["name"],
            "album_date": track["album"]["release_date"],
            # Multiple artists are collapsed into one comma-separated string.
            "artists": ", ".join(a["name"] for a in track["artists"]),
        })
# Explicit columns keep the schema (and the CSV header) stable even when no snapshot matched.
df_saved = pd.DataFrame(records, columns=saved_columns)

# Every snapshot repeats the same save events (same track, same saved_at),
# so merging several snapshots would otherwise duplicate rows.
df_saved = (
    df_saved
    .drop_duplicates(subset=["saved_at", "track_id"])
    .reset_index(drop=True)
)

# to_csv does not create missing directories; make sure data/interim exists first.
interim_dir = repo_root / "data" / "interim"
interim_dir.mkdir(parents=True, exist_ok=True)
df_saved.to_csv(interim_dir / "saved_tracks.csv", index=False)
logger.info("Saved saved_tracks.csv (%d rows)", len(df_saved))
df_saved.head()

2025-04-29 16:46:03,229 INFO root ▶ Saved saved_tracks.csv (50 rows)


Unnamed: 0,saved_at,track_id,track_name,album,album_date,artists
0,2025-04-28T17:20:58Z,4RoKNqyZ9622tcAeNPNv5k,City Boys,I Told Them...,2023-08-24,Burna Boy
1,2025-04-28T17:20:15Z,4v5kAh2wWyCSuKuhMJK8u6,Long Way 2 Go,Cassie (U.S. Version),2006-08-07,Cassie
2,2025-04-27T16:45:15Z,4LMlVCXHJtCE9abhmn0mYo,Pal Pal,Pal Pal,2025-02-17,"Afusic, AliSoomroMusic"
3,2025-04-24T03:22:48Z,2S7RApTsKT0CtYojYq2cKz,L'italiano,L'Italiano,1983-02-01,Toto Cutugno
4,2025-04-17T19:18:17Z,5fBghXeYCGIEVuExKytoJ9,Fell For You,Sicario,2025-01-17,Shubh


In [None]:
# Load the most recent user-profile snapshot, flatten it to a one-row frame and save it as CSV.
#
# Path.glob() yields matches in no guaranteed order, so "last element" is not
# reliably the newest file. Snapshot names end in "_YYYYMMDD_HHMMSS" (see the
# directory listing above), so sorting on those two trailing fields is chronological.
profile_files = sorted(
    raw_dir.glob("user_profile_*.json"),
    key=lambda path: path.stem.rsplit("_", 2)[-2:],
)
if not profile_files:
    # Clearer than the bare IndexError that indexing an empty list would raise.
    raise FileNotFoundError(f"No user_profile_*.json snapshot found in {raw_dir}")
profile = read_json_file(profile_files[-1])
# json_normalize flattens nested objects into dotted column names (e.g. followers.total).
df_profile = pd.json_normalize(profile)

out_csv = repo_root/"data"/"interim"/"user_profile.csv"
# to_csv does not create missing directories; make sure data/interim exists first.
out_csv.parent.mkdir(parents=True, exist_ok=True)
df_profile.to_csv(out_csv, index=False)
logger.info("Saved user_profile.csv to %s", out_csv)

print("Available profile fields:\n", df_profile.columns.tolist())

# Only show the fields that are actually present in this snapshot.
desired = ["display_name","id","country","followers.total","product"]
available = [c for c in desired if c in df_profile.columns]
print("Showing these fields:\n", available)
display(df_profile[available].T)

2025-05-05 21:30:45,059 INFO root ▶ Saved user_profile.csv to c:\Users\choks\OneDrive\Desktop\spotify-wrapped-analysis\data\interim\user_profile.csv


Available profile fields:
 ['country', 'display_name', 'href', 'id', 'images', 'product', 'type', 'uri', 'explicit_content.filter_enabled', 'explicit_content.filter_locked', 'external_urls.spotify', 'followers.href', 'followers.total']
Showing these fields:
 ['display_name', 'id', 'country', 'followers.total', 'product']


Unnamed: 0,0
display_name,Jeet
id,31cny4wvswa3zmq25ccg6w2masi4
country,US
followers.total,0
product,free


In [None]:
# Load the most recent user-playlists snapshot, keep one summary row per playlist and save it as CSV.
#
# Path.glob() yields matches in no guaranteed order, so "last element" is not
# reliably the newest file. Snapshot names end in "_YYYYMMDD_HHMMSS" (see the
# directory listing above), so sorting on those two trailing fields is chronological
# regardless of the limit/offset fields that precede them.
pl_files = sorted(
    raw_dir.glob("user_playlists_*.json"),
    key=lambda path: path.stem.rsplit("_", 2)[-2:],
)
if not pl_files:
    # Clearer than the bare IndexError that indexing an empty list would raise.
    raise FileNotFoundError(f"No user_playlists_*.json snapshot found in {raw_dir}")
pl_blob = read_json_file(pl_files[-1])

records = []
for item in pl_blob.get("items", []):
    # `or {}` also covers keys that are present but null, which .get(key, {}) does not.
    owner = item.get("owner") or {}
    tracks = item.get("tracks") or {}
    records.append({
        "playlist_id":   item.get("id"),
        "name":          item.get("name"),
        "owner":         owner.get("display_name"),
        "public":        item.get("public"),
        "tracks_total":  tracks.get("total")
    })

df_playlists = pd.DataFrame(records)
out_path = repo_root/"data"/"interim"/"user_playlists.csv"
# to_csv does not create missing directories; make sure data/interim exists first.
out_path.parent.mkdir(parents=True, exist_ok=True)
df_playlists.to_csv(out_path, index=False)
logger.info("Saved user_playlists.csv (%d rows) to %s", len(df_playlists), out_path)
df_playlists.head()

2025-04-29 16:50:38,053 INFO root ▶ Saved user_playlists.csv (2 rows) to c:\Users\choks\OneDrive\Desktop\spotify-wrapped-analysis\data\interim\user_playlists.csv


Unnamed: 0,playlist_id,name,owner,public,tracks_total
0,5R402c0QrsyaHUwZFwEmi9,Bhagwan,Jeet,True,5
1,7oBzvnRfjDOY3ZN78ejrME,SEXXX,Jeet,False,2


---

**All endpoints ingested!**  
Check your `data/interim/` folder for:
- `top_tracks.csv`
- `recently_played.csv`
- `saved_tracks.csv`
- `user_profile.csv`
- `user_playlists.csv`
- `playlist_tracks_top3.csv` *(not written by any cell shown in this notebook — generate it separately before relying on it)*

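You’re now ready for feature engineering and EDA on the ingested snapshots. Note that each endpoint is capped by the `--limit` used above, so this is a sample of your listening data rather than your complete history.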
