In [None]:
import json
import time

import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [None]:
# The SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables must be
# set before this cell runs.
# https://spotipy.readthedocs.io/en/2.19.0/?highlight=audio#client-credentials-flow
auth_manager = SpotifyClientCredentials()

# Timeout / retry settings handed to the Spotify client.
client_options = {
    "requests_timeout": 10,
    "retries": 10,
    "backoff_factor": 3,
}
sp = spotipy.Spotify(auth_manager=auth_manager, **client_options)

In [None]:
# Load the playlist slice; later cells read data['playlists'].
mpd_path = "/opt/cs6242_home/mpd.slice.0-999.json"
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between operating systems and locales.
with open(mpd_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# spotipy audio_features() api call has a max. of 100 track_uris that can be
# requested per API call.
MAX_TRACKS_PER_REQUEST = 100

# Map each track URI to the descriptive fields carried by the playlist data.
# A track that appears in several playlists is stored once (dict keys are
# unique; the last occurrence's values win, as before).
spotify_features = {}
for playlist in data['playlists']:
    for song in playlist['tracks']:
        spotify_features[song["track_uri"]] = {
            "artist_name": song["artist_name"],
            "track_name": song["track_name"],
            "album_name": song["album_name"],
        }

# We only care about the unique track_uris. Taking them from the dict keys
# keeps first-seen order, so the chunks are identical on every run (the order
# of a set of strings changes between interpreter sessions).
track_uris = list(spotify_features)

# Plain list slices: every chunk has at most MAX_TRACKS_PER_REQUEST URIs, the
# elements stay ordinary Python strings, and an empty playlist file simply
# yields no chunks instead of raising.
chunked_track_uris = [
    track_uris[start:start + MAX_TRACKS_PER_REQUEST]
    for start in range(0, len(track_uris), MAX_TRACKS_PER_REQUEST)
]
num_chunks = len(chunked_track_uris)

In [None]:
from requests.exceptions import ReadTimeout

# Fields copied from each audio-features object returned by the API.
_AUDIO_FEATURE_KEYS = (
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
)


def get_audio_features(track_uris, audio_trk_features, client=None, max_retries=None):
    """Fetch audio features for one batch of tracks and merge them into a dict.

    Args:
        track_uris: Sequence of track URIs to request in a single API call.
        audio_trk_features: Dict updated in place; each track URI is mapped to
            a dict holding the fields listed in ``_AUDIO_FEATURE_KEYS``.
        client: Object exposing ``audio_features(track_uris)``. Defaults to
            the notebook-level ``sp`` client.
        max_retries: Maximum number of additional attempts after a
            ``ReadTimeout``. ``None`` (the default) retries until the call
            succeeds, as the function always did.

    Returns:
        The same ``audio_trk_features`` dict that was passed in.

    Raises:
        ReadTimeout: If ``max_retries`` is set and every attempt timed out.
    """
    if client is None:
        client = sp

    # Nothing to request; len() is used so NumPy array chunks work as well.
    if len(track_uris) == 0:
        return audio_trk_features

    # Retry in a loop rather than by recursion so that a long run of timeouts
    # cannot end in a RecursionError.
    attempt = 0
    while True:
        try:
            audio_features = client.audio_features(track_uris)
            break
        except ReadTimeout:
            if max_retries is not None and attempt >= max_retries:
                raise
            attempt += 1

    for track_uri, features in zip(track_uris, audio_features):
        # The API answers with None in place of a track it has no audio
        # features for; leave such tracks out instead of crashing.
        if features is None:
            continue
        audio_trk_features[track_uri] = {
            key: features[key] for key in _AUDIO_FEATURE_KEYS
        }
    return audio_trk_features

In [None]:
# Collect the audio features of every chunk into one dict.
audio_track_features = {}
for uri_chunk in chunked_track_uris:
    audio_track_features = get_audio_features(uri_chunk, audio_track_features)

In [None]:
# Build one frame per source dict; the dict keys (track URIs) become the index.
spotify_df = pd.DataFrame.from_dict(spotify_features, orient='index')
audio_df = pd.DataFrame.from_dict(audio_track_features, orient='index')

# Index-aligned left join: every row of spotify_df is kept.
combined_df = spotify_df.join(audio_df, how='left')

In [None]:
# Preview the first five rows of the joined frame.
combined_df.head(5)

In [None]:
# Alternative kept for reference:
# combined_df.to_json("/opt/cs6242_home/mpd.slice.0-999_with_audio_features.json")

# Transposing first makes to_dict() key the outer dict by the frame's index
# (the track URIs). Non-finite floats (NaN, +/-inf) have no JSON
# representation, so they are written as null instead of the bare `NaN`
# token that json.dump would otherwise emit.
combined_dict = {
    track_uri: {
        column: (
            None
            if isinstance(value, float) and not np.isfinite(value)
            else value
        )
        for column, value in row.items()
    }
    for track_uri, row in combined_df.T.to_dict().items()
}
with open(
    "/opt/cs6242_home/mpd.slice.0-999_with_audio_features.json",
    "w",
    encoding="utf-8",
) as f:
    # allow_nan=False makes json.dump refuse anything that is not strict JSON.
    json.dump(combined_dict, f, indent=4, allow_nan=False)