# Get Audio Features from Spotify API


In this notebook, we retrieve audio features from Spotify API for tracks in the final dataset.
We get features we didn't retrieved during the first phase of preprocessing.  

In [3]:
import pandas as pd
import numpy as np
import json
import random
import base64
import requests
import json

## 1. PLAYLIST DATA : Get preview_url, image of the artist, popularity of the song 

First, we define functions to make the get requests.

In [8]:
#First authentication
def get_tokens() : 
    """
    Get access token for Spotify API
    You need to create a Spotify app (https://developer.spotify.com/dashboard) to get the client_id and client_secret.
    Then create a file named keys.json with the following structure:
    {
        "client_id": "your_client_id",
        "client_secret": "your_client_secret"
    }
    And store it in the same folder as this notebook.

    Returns:
        access_token (str): access token for Spotify API
    """
    with open('data/keys.json', 'r') as f:
        keys = json.load(f)

    client_id = keys['client_id']
    client_secret = keys['client_secret']

    client_creds = f"{client_id}:{client_secret}"
    # Encode the concatenated string as base64
    client_creds_b64 = base64.b64encode(client_creds.encode())

    # Define the headers for the request to the Spotify Accounts service
    token_url = 'https://accounts.spotify.com/api/token'
    token_data = {
        "grant_type": "client_credentials"
    }
    token_headers = {
        "Authorization": f"Basic {client_creds_b64.decode()}",
    }

    # Make the request to the Spotify Accounts service to get an access token
    response = requests.post(token_url, data=token_data, headers=token_headers)

    # Extract the access token from the response
    access_token = response.json()["access_token"]

    return access_token

def get_audio_features(track_ids, access_token):
    """
    Get audio features from Spotify API for a list of track IDs

    Args:
        track_ids (list): List of Spotify track IDs
        access_token (str): Access token for Spotify API
    
    Returns:
        features_list (list): List of dictionaries containing the audio features for each track
    """
    
    # Spotify API endpoint for getting audio features
    url = 'https://api.spotify.com/v1/tracks'

    # header for the request with authorization token
    headers = {
        'Authorization': 'Bearer ' + access_token
    }

    # convert list of track IDs to comma-separated string
    track_ids_str = ','.join(track_ids)

    # parameters for the request to the Spotify API
    params = {
        'ids': track_ids_str,
        'market': 'US'
    }

    # make the request to the Spotify API
    response = requests.get(url, headers=headers, params=params)
    # check if the request was successful
    if response.status_code == 200:
        # get the JSON response content
        response_json = json.loads(response.content)
        # extract the audio features for each track
        features_list = []
        for track_features in response_json['tracks']:
            if len(track_features['album']['images']) == 0:
                continue
            features = {
                'track_id': track_features['id'],
                'preview_url': track_features['preview_url'],
                'image_url': track_features['album']['images'][0]['url'],
                'popularity': track_features['popularity'],
            }
            features_list.append(features)
        return features_list
    else:
        print('Error:', response.status_code)

def retrieve_features_batch(list_ids, access_token, id_batch, last=False, batch_size=50):
    """
    Retrieve audio features for a batch of tracks from the Spotify API

    Args:
        tracks (pd.DataFrame): Dataframe containing the tracks
        access_token (str): Access token for Spotify API
        id_batch (int): Batch number
        last (bool): True if it is the last batch
        batch_size (int): Number of tracks per batch
    
    Returns:
        merged_df (pd.DataFrame): Dataframe containing the tracks and their audio features
    """ 
    
    #Keep a subset of rows from the tracks dataframe
    if last:
        ids = list_ids[id_batch*batch_size:]
    else:
        ids = list_ids[id_batch*batch_size:(id_batch+1)*batch_size]

    #Get the audio features for the tracks
    features_list = get_audio_features(ids, access_token)

    #Convert the list of dictionaries to a dataframe
    features_df = pd.DataFrame(features_list)

    #Merge the audio features with the tracks dataframe on track_id
    return features_df
            

def retrieve_features(list_ids):
    """
    Retrieve audio features for all tracks

    Args:
        tracks_df : Dataframe of all tracks

    Returns:
        tracks_final : Dataframe containing the tracks and their audio features
    """
    #Retrieve access_token 
    access_token = get_tokens()

    #Get the number of batches
    nb_batches = len(list_ids)//50

    #Initialize empty dataframe
    tracks_final = None

    #Retrieve audio features for each batch
    for i in range(nb_batches):
        print('Batch {i}/{nb}'.format(i=i, nb=nb_batches+1))
        try : 
            tracks_batch = retrieve_features_batch(list_ids, access_token, i)
        except TypeError:
            print('Error in batch {i}'.format(i=i))
            continue

        if i == 0:
            tracks_final = tracks_batch
        else:
            tracks_final = pd.concat([tracks_final, tracks_batch], ignore_index=True)    

    return tracks_final

Now, we retrieve the features we are interested in. 

In [11]:
df = pd.read_csv('data/tracks_final.csv')
access_token = get_tokens()
tracks_ids = df['track_id'].tolist()

df_spotify = retrieve_features(tracks_ids)

Batch 0/260
Batch 1/260
Batch 2/260
Batch 3/260
Batch 4/260
Batch 5/260
Batch 6/260
Batch 7/260
Batch 8/260
Batch 9/260
Batch 10/260
Batch 11/260
Batch 12/260
Batch 13/260
Batch 14/260
Batch 15/260
Batch 16/260
Batch 17/260
Batch 18/260
Batch 19/260
Batch 20/260
Batch 21/260
Batch 22/260
Batch 23/260
Batch 24/260
Batch 25/260
Batch 26/260
Batch 27/260
Batch 28/260
Batch 29/260
Batch 30/260
Batch 31/260
Batch 32/260
Batch 33/260
Batch 34/260
Batch 35/260
Batch 36/260
Batch 37/260
Batch 38/260
Batch 39/260
Batch 40/260
Batch 41/260
Batch 42/260
Batch 43/260
Batch 44/260
Batch 45/260
Batch 46/260
Batch 47/260
Batch 48/260
Batch 49/260
Batch 50/260
Batch 51/260
Batch 52/260
Batch 53/260
Batch 54/260
Batch 55/260
Batch 56/260
Batch 57/260
Batch 58/260
Batch 59/260
Batch 60/260
Batch 61/260
Batch 62/260
Batch 63/260
Batch 64/260
Batch 65/260
Batch 66/260
Batch 67/260
Batch 68/260
Batch 69/260
Batch 70/260
Batch 71/260
Batch 72/260
Batch 73/260
Batch 74/260
Batch 75/260
Batch 76/260
Batch 77/

In [13]:
#Merge to the initial datframe
df_final = pd.merge(df, df_spotify, on='track_id', how='left')

#Check how many nan values there are on column preview_url
print(df_final['preview_url'].isna().sum())
print(df_final['genre'].isna().sum())

#Drop rows with nan values on column preview_url
df_final = df_final.dropna(subset=['preview_url'])
df_final = df_final.dropna(subset=['genre'])

df_final

5801
522


Unnamed: 0,id,emotions,track,artist,tags,arousal,dominance,track_id,genre,acousticness,...,liveness,loudness,mode,speechiness,tempo,time_signature,valence,preview_url,image_url,popularity
0,4710,anticipation,Waving My Arms In The Air [Take 1],Syd Barrett,['ominous' 'fractured' 'insular' 'wry' 'eccent...,3.620607,4.231495,0pMOAZz9GxlXi2fXkRr0nN,psychedelic rock,0.793000,...,0.1490,-17.179,1,0.0430,105.726,4,0.4620,https://p.scdn.co/mp3-preview/d94c05e855b75f7b...,https://i.scdn.co/image/ab67616d0000b273e16120...,13.0
1,14603,anticipation,Uni Iso,Alva Noto,['intimate' 'nervous'],5.850000,5.533333,158gPbiLX3MUEptQOJgQES,glitch,0.000015,...,0.0513,-17.213,1,0.0528,127.642,4,0.0567,https://p.scdn.co/mp3-preview/8b9cdbe7caaba45b...,https://i.scdn.co/image/ab67616d0000b273f6b517...,4.0
3,2940,anticipation,Nobody Loves You Like I Do,MakTub,['harsh' 'urgent' 'yearning'],3.976000,4.889000,6C7NKesRR4mN3Dr4goQHlh,singer-songwriter,0.030000,...,0.1520,-8.650,1,0.0279,111.080,4,0.5050,https://p.scdn.co/mp3-preview/f038a8eb1f7c886c...,https://i.scdn.co/image/ab67616d0000b273b14263...,2.0
4,4067,anticipation,Harem Scarem,Focus,['manic' 'eerie' 'urgent' 'campy' 'passionate'],4.432000,3.902000,0QhfAl5OwRfnBfqLlV3b6N,progressive rock,0.022800,...,0.1140,-9.591,1,0.0342,88.484,4,0.6940,https://p.scdn.co/mp3-preview/a9d1a681a79af067...,https://i.scdn.co/image/ab67616d0000b2733aaa4d...,26.0
5,5160,anticipation,Song Slowly Song,Tim Buckley,['visceral' 'eerie' 'dramatic' 'mysterious' 'p...,4.466847,4.379754,5fbNnnlxmbzCHUwBjNHrys,singer-songwriter,0.909000,...,0.0985,-24.306,0,0.0367,100.523,4,0.1560,https://p.scdn.co/mp3-preview/90e97bb4681e8e7f...,https://i.scdn.co/image/ab67616d0000b273e55d5b...,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12946,11471,love,Everyday (#36),Dave Matthews Band,['nocturnal' 'ambitious' 'carefree' 'freewheel...,4.017569,6.008692,5Q2ZrpifaZkCuk6XIPH1cb,acoustic,0.487000,...,0.9730,-7.505,1,0.0359,92.463,4,0.4140,https://p.scdn.co/mp3-preview/1e1954f2c20ee681...,https://i.scdn.co/image/ab67616d0000b273ebfdb8...,28.0
12947,11520,love,old dirt hill,Dave Matthews Band,['nocturnal' 'ambitious' 'carefree' 'freewheel...,3.843571,5.893571,1GCaJJ1JoTTlD6SvNTVlqo,rock,0.297000,...,0.2260,-6.271,1,0.0312,113.533,4,0.8710,https://p.scdn.co/mp3-preview/d106766a8fba79ff...,https://i.scdn.co/image/ab67616d0000b273c7ad81...,33.0
12948,11540,love,Everybody Wake Up,Dave Matthews Band,['nocturnal' 'ambitious' 'carefree' 'freewheel...,3.843571,5.893571,3YQh1MeWtF2doriJTZTzO6,rock,0.031700,...,0.1640,-4.162,1,0.0399,110.993,4,0.4550,https://p.scdn.co/mp3-preview/3b60b984e24e2bc0...,https://i.scdn.co/image/ab67616d0000b273c7ad81...,27.0
12949,11544,love,Stand Up,Dave Matthews Band,['nocturnal' 'ambitious' 'carefree' 'freewheel...,3.843571,5.893571,0UxvDe61JZFfDopF1hM6Hp,rock,0.073200,...,0.3260,-5.958,1,0.1280,123.898,4,0.7380,https://p.scdn.co/mp3-preview/06a74e81897f724e...,https://i.scdn.co/image/ab67616d0000b273c7ad81...,27.0


We retrieve the features we are interested for the playlist on the website. 

In [15]:
#for each emotion, retrieve subset of tracks
emotions = df_final['emotions'].unique()
emotion2tracks = {}

for emotion in emotions:
    df_emotion = df_final[df_final['emotions'] == emotion][['track_id', 'track', 'artist', 'preview_url', 'image_url', 'popularity', 'genre']] 
    df_emotion = df_emotion.sort_values(by='popularity', ascending=False)
    
    #Store as a list of dictionaries to then convert to JSON
    tracks_list = []
    for index, row in df_emotion.iterrows():
        track = {
            'track': row['track'],
            'artist': row['artist'],
            'preview_url': row['preview_url'],
            'image_url': row['image_url'],
            'play' : 0,
            'genre' : row['genre']
        }
        tracks_list.append(track)
    emotion2tracks[emotion] = tracks_list

#Save as JSON file
with open('data/data_playlists.json', 'w') as fp:
    json.dump(emotion2tracks, fp)

#Print number of tracks per emotion
for key, value in emotion2tracks.items():
    print(key, len(value))

anticipation 33
anger 341
disgust 74
contempt 414
sadness 1735
remorse 337
surprise 3
fear 423
awe 5
trust 394
submission 1
joy 1948
optimism 356
love 747


## 2. ARTISTS DATA : Retrieve pictures of top 10 artists per emotion. 

In [16]:
#Get pictures of artists
def get_artist_features(track_id, access_token):
    """
    Get audio features from Spotify API for a list of track IDs

    Args:
        track_ids (list): List of Spotify track IDs
        access_token (str): Access token for Spotify API
    
    Returns:
        features_list (list): List of dictionaries containing the audio features for each track
    """
    
    # Spotify API endpoint for getting audio features
    url = 'https://api.spotify.com/v1/tracks/'+track_id

    # header for the request with authorization token
    headers = {
        'Authorization': 'Bearer ' + access_token
    }

    # make the request to the Spotify API
    response = requests.get(url, headers=headers)
    # check if the request was successful
    if response.status_code == 200:
        # get the JSON response content
        response_json = json.loads(response.content)
        # extract uri of the artist
        artist_id = response_json['artists'][0]['id']
        url_image, width, height = get_artist_image(artist_id, access_token)

        return url_image, width, height

    else:
        print('Error:', response.status_code)


#Get pictures of artists
def get_artist_image(artist_id, access_token):
    """
    Get audio features from Spotify API for a list of track IDs

    Args:
        track_ids (list): List of Spotify track IDs
        access_token (str): Access token for Spotify API
    
    Returns:
        features_list (list): List of dictionaries containing the audio features for each track
    """
    
    # Spotify API endpoint for getting audio features
    url = 'https://api.spotify.com/v1/artists/'+artist_id

    # header for the request with authorization token
    headers = {
        'Authorization': 'Bearer ' + access_token
    }

    # make the request to the Spotify API
    response = requests.get(url, headers=headers)
    # check if the request was successful
    if response.status_code == 200:
        # get the JSON response content
        response_json = json.loads(response.content)
        # extract uri of the artist

        #If there is no image, return None
        if len(response_json['images']) == 0:
            print(artist_id)

            return None, None, None
        
        artist_url = response_json['images'][0]['url']
        width = response_json['images'][0]['width']
        height = response_json['images'][0]['height']

        return artist_url, width, height

    else:
        print(artist_id)
        print('Error:', response.status_code)

In [None]:
#Read the JSON file
with open('data/top_artists.json') as json_file:
    top_artists = json.load(json_file)

#Retrieve access_token
access_token = get_tokens()

#Get the picture of each artist
for key, value in top_artists.items():
    for val in value : 
        artist_url, width, height = get_artist_features(val['track_id'], access_token)
        val['image_url'] = artist_url
        val['width'] = width
        val['height'] = height

#Write the JSON file
with open('data/top_artists.json', 'w') as fp:
    json.dump(top_artists, fp)

## 3. Retrieve mean of audio features for each emotion. 

In [4]:
df = pd.read_csv('data/tracks_final.csv')

#Keep only the columns we need
to_keep = ['emotions', 'danceability', 'arousal', 'dominance', 'energy', 'loudness',
             'acousticness', 'instrumentalness', 'valence']
df = df[to_keep]

In [5]:
#Scale all audio features so that all values are between 0 and 100
for i in to_keep[1:]:
    df[i] = round((df[i] - df[i].min()) / (df[i].max() - df[i].min()) * 100).astype(int)

df

Unnamed: 0,emotions,danceability,arousal,dominance,energy,loudness,acousticness,instrumentalness,valence
0,anticipation,84,51,55,12,57,80,0,47
1,anticipation,37,86,75,34,57,0,91,6
2,anticipation,31,66,60,58,78,9,0,9
3,anticipation,67,57,65,43,74,3,0,51
4,anticipation,48,64,50,80,72,2,87,70
...,...,...,...,...,...,...,...,...,...
12985,love,59,57,80,70,79,10,0,15
12986,love,31,61,78,18,58,94,0,16
12987,love,76,60,80,6,62,99,0,7
12988,love,43,67,88,87,83,0,0,27


In [6]:
#Retrieve emotions
emotions = df['emotions'].unique()
dict_features = {}

#For each emotion, compute the mean of each audio feature
for emotion in emotions : 
    print(emotion)
    df_emotion = df[df['emotions'] == emotion]
    df_emotion_mean = df_emotion.mean()
    df_emotion_mean = df_emotion_mean.round(decimals=1) #Round to 1 decimal
    df_emotion_mean = df_emotion_mean.sort_values(ascending=False).to_dict() #Sort by descending order
    dict_features[emotion] = df_emotion_mean

#Write the JSON file
with open('data/features.json', 'w') as fp:
    json.dump(dict_features, fp)

anticipation
anger
disgust
contempt
sadness
remorse
surprise
fear
awe
trust
submission
joy
optimism
love


  df_emotion_mean = df_emotion.mean()
