In [None]:
import spotipy
import json
import time
import numpy as np
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from getpass import getpass
from itertools import groupby
from operator import itemgetter

In [None]:
# Build the Spotify client from credentials typed in at run time, so that no
# secret is ever stored in the notebook itself.

client_id = getpass(prompt='Enter Spotify Client ID: ')
client_secret = getpass(prompt='Enter Spotify Client Secret: ')

credentials_manager = SpotifyClientCredentials(
    client_id=client_id,
    client_secret=client_secret,
)
spotify_client = spotipy.Spotify(client_credentials_manager=credentials_manager)

In [None]:
# Ask Spotify for its recommendation genre seeds; the crawl below iterates over them.
genre_seeds_response = spotify_client.recommendation_genre_seeds()
genres = genre_seeds_response['genres']
len(genres)

In [None]:
# Helper functions

def extract_album_data(album):
    """Return a trimmed-down copy of an album dict.

    Keeps the album's ``id``, ``name``, ``release_date`` and ``total_tracks``
    and replaces its ``artists`` entries with ``(id, name)`` tuples.

    Raises:
        KeyError: if ``album`` (or one of its artists) lacks a required key.
    """
    scalar_fields = ('id', 'name', 'release_date', 'total_tracks')

    summary = {}
    for field in scalar_fields:
        summary[field] = album[field]

    artist_pairs = []
    for performer in album['artists']:
        artist_pairs.append((performer['id'], performer['name']))
    summary['artists'] = artist_pairs

    return summary


def extract_artists_data(artists):
    """Convert a sequence of artist dicts into a list of ``(id, name)`` tuples."""
    pairs = []
    for performer in artists:
        pairs.append((performer['id'], performer['name']))
    return pairs


def try_extract_data(item):
    """Extract the fields of interest from a track item, best-effort.

    Args:
        item: a track dict with ``id``, ``name``, ``album`` and ``artists`` keys.

    Returns:
        A dict with ``id``, ``name``, ``album`` (see ``extract_album_data``) and
        ``artists`` (see ``extract_artists_data``), or an empty dict when the
        item is missing a field or is not shaped like a track at all.
    """
    try:
        return {
            'id': item['id'],
            'name': item['name'],
            'album': extract_album_data(item['album']),
            'artists': extract_artists_data(item['artists'])
        }
    except (KeyError, TypeError):
        # Only malformed items are tolerated: a missing key or a value of the wrong
        # type (e.g. None). A bare ``except`` would also swallow KeyboardInterrupt,
        # making a long crawl impossible to stop cleanly.
        return {}

In [None]:
def search_genre_with_retry(genre, max_retries=5, backoff_factor=0.1):
    """Run the first track search for ``genre``, retrying on failure.

    Args:
        genre: genre name, interpolated into the ``genre:<name>`` search query.
        max_retries: total number of attempts before giving up.
        backoff_factor: base delay in seconds; the wait after attempt ``i``
            (0-based) is ``backoff_factor * 2 ** i``.

    Returns:
        The ``'tracks'`` page of the search response, or ``None`` when every
        attempt failed.
    """
    for attempt in range(max_retries):
        try:
            return spotify_client.search(q=f'genre:{genre}', type='track')['tracks']
        except Exception as e:
            # Deliberately broad: the crawl is best-effort and any request failure
            # should lead to a retry rather than abort the whole run.
            will_retry = attempt < max_retries - 1
            retry_note = " Trying again." if will_retry else ""
            print(f"Could not get results when searching genre '{genre}'. Error: {e!r}.{retry_note}")
            if will_retry:
                # Exponential backoff; no point sleeping after the final attempt.
                time.sleep(backoff_factor * (2 ** attempt))

    print(f"Max retries exceeded. Ignoring genre '{genre}'.")
    return None


def next_with_retry(genre, results, max_retries=5, backoff_factor=0.1):
    """Fetch the page that follows ``results``, retrying on failure.

    Args:
        genre: genre being crawled; used only in log messages.
        results: the current page of tracks; its ``'offset'`` (and ``'limit'``,
            when present) are used for logging.
        max_retries: total number of attempts before giving up.
        backoff_factor: base delay in seconds; the wait after attempt ``i``
            (0-based) is ``backoff_factor * 2 ** i``.

    Returns:
        The ``'tracks'`` page of the next response, or ``None`` when every
        attempt failed.
    """
    for attempt in range(max_retries):
        try:
            return spotify_client.next(results)['tracks']
        except Exception as e:
            # Deliberately broad: the crawl is best-effort and any request failure
            # should lead to a retry rather than abort the whole run.
            will_retry = attempt < max_retries - 1
            retry_note = " Trying again." if will_retry else ""
            print(f"Could not get next page from genre '{genre}'. Current offset: '{results['offset']}'. Error: {e!r}.{retry_note}")
            if will_retry:
                # Exponential backoff; no point sleeping after the final attempt.
                time.sleep(backoff_factor * (2 ** attempt))

    # The skipped page starts one page size past the current offset. Use the size
    # the current page reports instead of assuming 50 (kept only as a fallback).
    skipped_offset = results['offset'] + results.get('limit', 50)
    print(f"Max retries exceeded. Ignoring genre '{genre}' at offset '{skipped_offset}'.")
    return None


def get_all_tracks_by_genre(genre):
    """Collect the extracted tracks from every result page of a genre search.

    Paging stops at the last page, or early when a page cannot be fetched even
    after retries; whatever was gathered up to that point is still returned.

    Args:
        genre: genre name passed to ``search_genre_with_retry``.

    Returns:
        A list of track dicts as produced by ``try_extract_data``. Items that
        could not be extracted (reported by ``try_extract_data`` as an empty
        dict) are left out, because later steps index every song by ``'id'``.
        The list is empty when the initial search fails.
    """
    page = search_genre_with_retry(genre)
    if page is None:
        return []

    tracks = []
    while True:
        extracted = (try_extract_data(item) for item in page['items'])
        # Drop the empty placeholders that mark unparseable items.
        tracks.extend(track for track in extracted if track)

        if page['next'] is None:
            break
        page = next_with_retry(genre, page)
        if page is None:
            break

    return tracks

In [None]:
# Crawl every genre seed and gather the extracted tracks in one flat list.
# The list grows in place so that partial results survive an interrupted run.
songs = []

for genre in genres:
    genre_tracks = get_all_tracks_by_genre(genre)
    songs.extend(genre_tracks)

len(songs)

In [None]:
# Persist the raw crawl result to song_data.json so the crawl need not be repeated

with open('song_data.json', 'w') as song_file:
    json.dump(songs, song_file)

In [None]:
# Read the crawl result back from song_data.json into songs

with open("song_data.json", "r") as song_file:
    raw_json = song_file.read()
songs = json.loads(raw_json)

len(songs)

In [None]:
# Select unique songs
#
# try_extract_data records an unparseable item as an empty dict. Such entries have
# no 'id', so they are dropped first; otherwise itemgetter('id') raises KeyError
# during the sort (and a missing/None id could not be ordered against strings).

songs = [song for song in songs if song.get('id')]
songs.sort(key=itemgetter('id'))  # groupby only merges *adjacent* equal keys
songs = [next(duplicates) for _, duplicates in groupby(songs, key=itemgetter('id'))]
len(songs)

In [None]:
# There are too many songs for one audio-features request, so we split them into
# groups. The number of groups is derived from the song count instead of being
# fixed at 1000: Spotify's audio-features endpoint takes at most 100 track ids per
# request, and a fixed count gives oversized groups for large crawls and empty
# groups for small ones.

AUDIO_FEATURES_BATCH_SIZE = 100
# Ceiling division; keep at least one group so later steps always have input.
group_count = max(1, (len(songs) + AUDIO_FEATURES_BATCH_SIZE - 1) // AUDIO_FEATURES_BATCH_SIZE)
song_groups = np.array_split(songs, group_count, axis=0)

In [None]:
# Helper functions

def get_audio_features_with_retry(group_number, ids, max_retries=5, backoff_factor=0.1):
    """Request the audio features for a group of track ids, retrying on failure.

    Args:
        group_number: index of the group; used only in log messages.
        ids: track ids passed to ``spotify_client.audio_features``.
        max_retries: total number of attempts before giving up.
        backoff_factor: base delay in seconds; the wait after attempt ``i``
            (0-based) is ``backoff_factor * 2 ** i``.

    Returns:
        Whatever ``spotify_client.audio_features(ids)`` returns, or ``None``
        when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            return spotify_client.audio_features(ids)
        except Exception as e:
            # Deliberately broad: enrichment is best-effort and any request failure
            # should lead to a retry rather than abort the whole run.
            will_retry = attempt < max_retries - 1
            retry_note = " Trying again." if will_retry else ""
            print(f"Could not get audio features from group '{group_number}'. Error: {e!r}.{retry_note}")
            if will_retry:
                # Exponential backoff; no point sleeping after the final attempt.
                time.sleep(backoff_factor * (2 ** attempt))

    print(f"Max retries exceeded. Ignoring group '{group_number}'.")
    return None

In [None]:
# Enrich songs with their Audio features
#
# Each group is replaced by a list of copies of its songs carrying an extra
# 'audio_features' key. Groups whose request ultimately fails are left untouched.

for group_number, group in enumerate(song_groups):
    ids = [song['id'] for song in group]
    if not ids:
        # Nothing to look up; skip the request (and its retries) for empty groups.
        continue

    audio_features = get_audio_features_with_retry(group_number, ids)
    if audio_features is None:
        continue

    # Features are paired with songs by position.
    song_groups[group_number] = [
        {**song, 'audio_features': features}
        for song, features in zip(group, audio_features)
    ]

In [None]:
# Stitch the per-group batches back together into one flat array of songs

songs = np.concatenate(song_groups, axis=0)
len(songs)

In [None]:
# Write the enriched songs to enriched_song_data.json
# (songs is converted to a plain list first so the json module can serialize it)

songs = list(songs)
with open('enriched_song_data.json', 'w') as enriched_file:
    json.dump(songs, enriched_file)

In [None]:
# Flatten the nested song dicts into a table (nested keys become dotted column names)
songs_data = pd.json_normalize(data=songs)

In [None]:
# Export the flattened table to songs_data.csv (the DataFrame index is written too)
songs_data.to_csv(path_or_buf='songs_data.csv')