In [11]:
import os
import json
import pandas as pd

# Paths
mpd_path = "spotify_million_playlist_dataset/data/"
features_csv_path = "tracks_features/tracks_features.csv"

# Load track features
features_df = pd.read_csv(features_csv_path)
print(f"✅ Loaded track features: {features_df.shape[0]} rows")

# Get first 500 JSON files only
json_files = sorted([f for f in os.listdir(mpd_path) if f.endswith(".json")])[:100]
print(f"✅ Will process {len(json_files)} playlist files")

# Read and flatten playlist data
rows = []
for idx, file in enumerate(json_files):
    print(f"📁 {idx+1}/{len(json_files)}: {file}")
    with open(os.path.join(mpd_path, file), "r") as f:
        data = json.load(f)
        for playlist in data["playlists"]:
            pid = playlist["pid"]
            for track in playlist["tracks"]:
                rows.append({
                    "pid": pid,
                    "track_uri": track["track_uri"],
                    "track_name": track["track_name"],
                    "artist_name": track["artist_name"]
                })

# Convert to DataFrame
playlist_df = pd.DataFrame(rows)

# Clean track_uri for merging
playlist_df["track_id"] = playlist_df["track_uri"].str.replace("spotify:track:", "", regex=False)

# Merge with track features
merged_df = pd.merge(playlist_df, features_df, left_on="track_id", right_on="id")
print(f"✅ Merged dataset size: {merged_df.shape[0]} rows")

# Save to CSV
merged_df.to_csv("new_features.csv", index=False)
print("✅ Saved to new_features.csv")


✅ Loaded track features: 1204025 rows
✅ Will process 100 playlist files
📁 1/100: mpd.slice.0-999.json
📁 2/100: mpd.slice.1000-1999.json
📁 3/100: mpd.slice.10000-10999.json
📁 4/100: mpd.slice.100000-100999.json
📁 5/100: mpd.slice.101000-101999.json
📁 6/100: mpd.slice.102000-102999.json
📁 7/100: mpd.slice.103000-103999.json
📁 8/100: mpd.slice.104000-104999.json
📁 9/100: mpd.slice.105000-105999.json
📁 10/100: mpd.slice.106000-106999.json
📁 11/100: mpd.slice.107000-107999.json
📁 12/100: mpd.slice.108000-108999.json
📁 13/100: mpd.slice.109000-109999.json
📁 14/100: mpd.slice.11000-11999.json
📁 15/100: mpd.slice.110000-110999.json
📁 16/100: mpd.slice.111000-111999.json
📁 17/100: mpd.slice.112000-112999.json
📁 18/100: mpd.slice.113000-113999.json
📁 19/100: mpd.slice.114000-114999.json
📁 20/100: mpd.slice.115000-115999.json
📁 21/100: mpd.slice.116000-116999.json
📁 22/100: mpd.slice.117000-117999.json
📁 23/100: mpd.slice.118000-118999.json
📁 24/100: mpd.slice.119000-119999.json
📁 25/100: mpd.sli