Danielle Paes Barretto de Arruda Camara

**VERSION: 30-07-19 (last review)**

Retrieve tracks metadata and audio features from playlists.

Uses the data obtained using notebook: **01-obtain_playlists_per_category.ipynb**

**Input:** 

* .csv file with playlists info (e.g. playlists_category_toplists_NL_2019-06-02.csv)

**Output:**
* .csv file containing metadata as well as audio feature information of tracks (e.g.  
"tracks_info_category_summer_playlist_id_37i9dQZF1DX83I5je4W4rP_2019-06-02.csv")


**Attention:** In order to obtain playlist info for all categories ("playlist_info_category_ALL_2019-06-02.csv") at once the files need to be concatenated. This can be done using notebook **concatenating_all_categories_playlists_info.ipynb**




# Import libraries

In [1]:
import pandas as pd
from tqdm import tqdm

import time
TodaysDate = time.strftime("%Y-%m-%d")

# Folder(s)

In [2]:
input_folder = "./data/NEW_DATA/playlists/"
tracks_info_folder = "./data/NEW_DATA/tracks/"

# Access to Spotify API 

For credentials : https://developer.spotify.com/dashboard/login


In [3]:
# Build the module-level Spotify client `sp` used by the functions in this notebook.
import spotipy 
from spotipy.oauth2 import SpotifyClientCredentials 
# Placeholders: replace with the client id / client secret of your own Spotify app.
# NOTE(review): avoid committing real credentials to version control.
cid ="********************************" 
secret = "********************************" 

# maybe redirect uri will be required for some of the commands 
# (not referenced anywhere else in the visible part of this notebook)
redirect_uri = 'http://127.0.0.1:5001/login/authorized'

# the user id of my account (passed as `user` to the functions below)
username = 'your_spotify_user_name'

# Authenticate with the client-credentials manager and create the shared client.
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret) 
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Functions

In [4]:
def get_playlist_tracks(user,playlist_id):
    """ Collect every track item of a playlist, following pagination

    Input:
        user: the spotify id of the user
        playlist_id: Spotify Id of a playlist (base-62 identifier, e.g 6rqhFgbbKwnb9MLmUQDhG6)

    Output:
        list with the 'items' entries of all result pages of the playlist

    """

    page = sp.user_playlist_tracks(user,playlist_id)
    all_items = page['items']
    next_page = page['next']

    # keep requesting pages for as long as the API reports a following one
    while next_page:
        page = sp.next(page)
        all_items += page['items']
        next_page = page['next']

    return all_items

def retrieve_tracks_from_playlist(user,playlist_id,playlist_name):
    """ Retrieve metadata of all tracks of a selected playlist

    Input:
        user : the spotify id of the user
        playlist_id: Spotify Id of a playlist (base-62 identifier, e.g 6rqhFgbbKwnb9MLmUQDhG6)
        playlist_name: Name of the playlist

    Output:
        dictionary with
            'playlist_name': playlist name (single value, as given),
            'playlist_id': playlist id (single value, as given),
            'track_id' : list of track ids,
            'track_uri' : list of track uris,
            'track_preview_url' : list of preview urls,
            'track_name' : list of track names,
            'track_duration' : list of track durations in ms,
            'track_popularity' : list of track popularities,
            'artist_name': list of artist names (several artists of one track
                           are joined with ','),
            'album_name': list of album names

        All lists have the same length: a playlist entry is either fully
        included or skipped entirely.
    """

    track_columns = ('track_id', 'track_uri', 'track_preview_url', 'track_name',
                     'track_duration', 'track_popularity', 'artist_name',
                     'album_name')
    columns = {column: [] for column in track_columns}

    for item in get_playlist_tracks(user,playlist_id):
        try:
            track = item['track']
            # Build the complete row first: if any field is unavailable the
            # TypeError is raised before anything was appended, so the column
            # lists can never end up with different lengths.
            row = {
                'track_id': track['id'],
                'track_uri': track['uri'],
                'track_preview_url': track['preview_url'],
                'track_name': track['name'],
                'track_duration': track['duration_ms'],
                'track_popularity': track['popularity'],
                'artist_name': ','.join(artist['name'] for artist in track['artists']),
                'album_name': track['album']['name'],
            }
        except TypeError:
            # entry without usable track data (e.g. 'track' is None): skip it
            continue

        for column in track_columns:
            columns[column].append(row[column])

    tracks_info = {'playlist_name': playlist_name,
                   'playlist_id' : playlist_id}
    tracks_info.update(columns)
    return tracks_info

def retrieve_audio_features_tracks_playlist(track_ids):
    """ Retrieve audio features of tracks of a playlist

    Input:
        track_ids: list of track_ids (entries that are None are ignored)

    Output:
        audio_features: list of dictionaries with the audio features of the
                        tracks; tracks for which no features are returned
                        are left out
    """

    # number of ids sent to sp.audio_features per request
    batchsize = 100

    # ids that are None cannot be looked up, so they are removed before batching
    valid_ids = [track_id for track_id in track_ids if track_id is not None]

    audio_features = []
    for start in range(0,len(valid_ids),batchsize):
        batch = valid_ids[start:start+batchsize]
        feature_results = sp.audio_features(batch)
        # the result contains None for tracks without audio features
        audio_features.extend(features for features in feature_results
                              if features is not None)

    return audio_features

def merge_and_clean_category(user,playlist_id,playlist_name,category_id):

    """ Merge all information of tracks of a playlist, select relevant features,
    generate a dataframe and save it in a .csv file

    Input:
        user : the spotify id of the user
        playlist_id: Spotify Id of a playlist (base-62 identifier, e.g 6rqhFgbbKwnb9MLmUQDhG6)
        playlist_name: Name of the playlist
        category_id: id of the category the playlist belongs to (stored in
                     column 'category_id' and used in the file name)

    Output:
        None. The dataframe with metadata and audio features of the tracks is
        written to
        tracks_info_folder/tracks_info_category_<category_id>_playlist_id_<playlist_id>_<TodaysDate>.csv
        No file is written when no audio features could be retrieved.
    """
    import os

    tracks_playlist = retrieve_tracks_from_playlist(user,playlist_id,playlist_name)
    results_audio = retrieve_audio_features_tracks_playlist(tracks_playlist['track_id'])

    if not results_audio:
        # without audio features there is no 'id' column to merge on
        print('no audio features found for playlist', playlist_id, '- no file written')
        return

    df_tracks = pd.DataFrame(tracks_playlist)
    # A track that occurs more than once in the playlist also occurs more than
    # once in the audio features; keeping one feature row per id prevents the
    # merge from multiplying those rows.
    df_audio_features = pd.DataFrame(results_audio).drop_duplicates(subset='id')

    df_tracks_complete = df_tracks.merge(df_audio_features,right_on='id',left_on='track_id')
    df_tracks_complete['category_id'] = category_id

    # Selecting the output columns explicitly also discards the columns that
    # are not needed ('analysis_url', 'id', 'time_signature', 'track_href',
    # 'type', 'uri', 'track_duration').
    output_columns = ['category_id','playlist_name', 'playlist_id', 'track_id', 'track_uri',
                      'track_preview_url', 'track_name', 'track_popularity', 'artist_name',
                      'album_name', 'acousticness', 'danceability', 'duration_ms', 'energy',
                      'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
                      'speechiness', 'tempo', 'valence']
    df_tracks_complete = df_tracks_complete[output_columns]

    # save dataframe in csv
    filename = "tracks_info_category_{}_playlist_id_{}_{}.csv".format(
        category_id, playlist_id, TodaysDate)
    os.makedirs(tracks_info_folder, exist_ok=True)
    df_tracks_complete.to_csv(os.path.join(tracks_info_folder, filename),index = False)

# Retrieving audio features

## Retrieving audio features for one playlist

In [5]:
user = username
merge_and_clean_category(user,playlist_id='37i9dQZF1DX35vibahyjHG',playlist_name='Phenomenal Woman',category_id='afro')

In [6]:
df_test = pd.read_csv(tracks_info_folder+"tracks_info_category_afro_playlist_id_37i9dQZF1DX35vibahyjHG_2019-06-02.csv")

In [7]:
df_test.head()

Unnamed: 0,category_id,playlist_name,playlist_id,track_id,track_uri,track_preview_url,track_name,track_popularity,artist_name,album_name,...,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,afro,Phenomenal Woman,37i9dQZF1DX35vibahyjHG,6KSRlzHjxjm2owDJIku5Xf,spotify:track:6KSRlzHjxjm2owDJIku5Xf,https://p.scdn.co/mp3-preview/8477319c0ac7e062...,Without You,19,Omawumi,Without You,...,191000,0.77,2e-06,10,0.0792,-5.229,0,0.368,95.952,0.84
1,afro,Phenomenal Woman,37i9dQZF1DX35vibahyjHG,6wNWxYdoTKuypUxnBh9Eah,spotify:track:6wNWxYdoTKuypUxnBh9Eah,https://p.scdn.co/mp3-preview/0cb54b0cdfc7b78f...,Artificial Heart,21,Yanga,Promised Land,...,185143,0.467,0.0,9,0.133,-7.846,1,0.087,139.766,0.335
2,afro,Phenomenal Woman,37i9dQZF1DX35vibahyjHG,0NMfKNtRnvyF9QjlRvnv0I,spotify:track:0NMfKNtRnvyF9QjlRvnv0I,https://p.scdn.co/mp3-preview/bc136ce698c1b1fb...,Spend Some Time,39,"Amaarae,Wande Coal",Spend Some Time,...,168333,0.419,0.00301,0,0.104,-13.102,0,0.103,101.025,0.257
3,afro,Phenomenal Woman,37i9dQZF1DX35vibahyjHG,0eeIwG0vTlzUpkpeKA4MuK,spotify:track:0eeIwG0vTlzUpkpeKA4MuK,https://p.scdn.co/mp3-preview/0e5e861a48d4327e...,The Beginning,45,Aṣa,The Beginning - Single,...,224147,0.506,0.000183,6,0.0503,-6.648,1,0.0288,158.025,0.349
4,afro,Phenomenal Woman,37i9dQZF1DX35vibahyjHG,2FIECkWDoS7xEGcYbI6UhN,spotify:track:2FIECkWDoS7xEGcYbI6UhN,https://p.scdn.co/mp3-preview/45e8604571b50b4e...,Ng'yesaba,17,Tabia,The Journey,...,234333,0.802,0.000775,6,0.148,-6.81,0,0.0466,88.021,0.714


In [8]:
df_test.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 22 columns):
category_id          63 non-null object
playlist_name        63 non-null object
playlist_id          63 non-null object
track_id             63 non-null object
track_uri            63 non-null object
track_preview_url    50 non-null object
track_name           63 non-null object
track_popularity     63 non-null int64
artist_name          63 non-null object
album_name           63 non-null object
acousticness         63 non-null float64
danceability         63 non-null float64
duration_ms          63 non-null int64
energy               63 non-null float64
instrumentalness     63 non-null float64
key                  63 non-null int64
liveness             63 non-null float64
loudness             63 non-null float64
mode                 63 non-null int64
speechiness          63 non-null float64
tempo                63 non-null float64
valence              63 non-null float64
dtypes: floa

## Retrieving audio features of all playlists in one chosen category

### Example: all playlists in category 'blues'

In [9]:
# recovering dataframe with information about playlists in category blues - this was obtained using 
# notebook '01-obtain_playlists_per_category'

df_blues = pd.read_csv(input_folder+"playlists_category_blues_NL_2019-06-02.csv")
df_blues.head()

Unnamed: 0,category,playlist_id,playlist_name
0,blues,37i9dQZF1DXdkAbM8agIbA,From the Delta to Chicago
1,blues,37i9dQZF1DXbkKnGZHv1kf,Blues Origins
2,blues,37i9dQZF1DX2iUghHXGIjj,Acoustic Blues
3,blues,37i9dQZF1DX9stbPFTxeaB,"Funky, Heavy, Bluesy"
4,blues,37i9dQZF1DWSKpvyAAcaNZ,Blues & Roots Rock


In [10]:
def retrieve_audio_features_playlists(user,df):
    """ Retrieve audio features of playlists and save information in csv files

    Input:
        user: Spotify username
        df: dataframe with information about playlists of one or more
            categories (columns 'category', 'playlist_id', 'playlist_name')

    Output: None. One csv file per playlist in df is written by
            merge_and_clean_category. Playlists for which the Spotify API
            reports an error are skipped and reported on screen.
    """

    # number of playlists skipped so far because of an API error
    count = 0

    # iterate over the column values so the result does not depend on the
    # index labels of df
    playlists = zip(df['category'], df['playlist_id'], df['playlist_name'])

    for category_id, playlist_id, playlist_name in tqdm(playlists, total=len(df)):
        try:
            merge_and_clean_category(user,playlist_id,playlist_name,category_id)
        except spotipy.SpotifyException as error:
            # best effort: report the failing playlist and continue with the next
            count = count + 1
            print('error', count, playlist_id, error)

    print("Process finished!")

In [11]:
user = username
df = df_blues

retrieve_audio_features_playlists(user,df)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [00:11<00:00,  1.34it/s]


Process finished!


In [12]:
# recovering info from one playlist within blues
df_tracks_blues = pd.read_csv(tracks_info_folder+"tracks_info_category_blues_playlist_id_37i9dQZF1DX5AuRugisweW_2019-06-02.csv")

In [13]:
df_tracks_blues.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 22 columns):
category_id          58 non-null object
playlist_name        58 non-null object
playlist_id          58 non-null object
track_id             58 non-null object
track_uri            58 non-null object
track_preview_url    46 non-null object
track_name           58 non-null object
track_popularity     58 non-null int64
artist_name          58 non-null object
album_name           58 non-null object
acousticness         58 non-null float64
danceability         58 non-null float64
duration_ms          58 non-null int64
energy               58 non-null float64
instrumentalness     58 non-null float64
key                  58 non-null int64
liveness             58 non-null float64
loudness             58 non-null float64
mode                 58 non-null int64
speechiness          58 non-null float64
tempo                58 non-null float64
valence              58 non-null float64
dtypes: floa

In [14]:
df_tracks_blues.head()

Unnamed: 0,category_id,playlist_name,playlist_id,track_id,track_uri,track_preview_url,track_name,track_popularity,artist_name,album_name,...,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,blues,Crossroad Blues,37i9dQZF1DX5AuRugisweW,5dDaQf3QUWIpukuGFDhScm,spotify:track:5dDaQf3QUWIpukuGFDhScm,,Me & the Devil Blues,0,Jonah Tolchin,Eldawise,...,231000,0.424,0.00513,7,0.198,-9.046,1,0.0376,110.584,0.506
1,blues,Crossroad Blues,37i9dQZF1DX5AuRugisweW,4qrUQuQ2Nt9jgZW99rBcbo,spotify:track:4qrUQuQ2Nt9jgZW99rBcbo,https://p.scdn.co/mp3-preview/16e7bdd07776b8bc...,Restless Sinner,29,Black Rebel Motorcycle Club,Howl,...,191987,0.281,0.0,2,0.222,-12.363,0,0.036,120.485,0.298
2,blues,Crossroad Blues,37i9dQZF1DX5AuRugisweW,1mpkTTUxWTB3FlO2OlRIB4,spotify:track:1mpkTTUxWTB3FlO2OlRIB4,https://p.scdn.co/mp3-preview/7087f78b77b349f6...,Seven Hells,42,Brown Bird,Fits of Reason,...,199120,0.613,0.00134,9,0.0945,-9.51,0,0.0838,143.165,0.545
3,blues,Crossroad Blues,37i9dQZF1DX5AuRugisweW,341o4T7XtSZUKeQvIw2wms,spotify:track:341o4T7XtSZUKeQvIw2wms,,Devil's Resting Place,2,Laura Marling,Once I Was An Eagle,...,194467,0.59,0.0636,2,0.0934,-12.299,0,0.0916,179.267,0.713
4,blues,Crossroad Blues,37i9dQZF1DX5AuRugisweW,1TrGdXSgiBm8W68D2K1COG,spotify:track:1TrGdXSgiBm8W68D2K1COG,https://p.scdn.co/mp3-preview/15f4fb795c08eade...,Cross Road Blues,57,Robert Johnson,King Of The Delta Blues Singers,...,149560,0.311,0.000261,9,0.102,-11.959,1,0.0525,97.159,0.638


In [16]:
# number of unique track_id in this playlist
len(df_tracks_blues.track_id.unique())

58

In [20]:
# address for the snippet (30-sec sample of the second track in df_tracks_blues)
df_tracks_blues.track_preview_url[1]

'https://p.scdn.co/mp3-preview/16e7bdd07776b8bcfb0a565939450253451b895b?cid=63a77ff40e794a12ae216a18befde98c'

## Retrieve audio features for all tracks for all playlists

To do this, simply call retrieve_audio_features_playlists(user,df) with df set to a dataframe loaded from the .csv file that contains the playlist info of all categories. 


**Attention:** In order to obtain playlist info for all categories together the files obtained with notebook 01 need to be concatenated. This can be done using notebook **concatenating_all_categories_playlists_info.ipynb**

In [21]:
# retrieve info about all playlist

df = pd.read_csv(input_folder+ "playlist_info_category_ALL_2019-06-02.csv")

In [22]:
df.shape

(1587, 3)

In [23]:
df.head()

Unnamed: 0,category,playlist_id,playlist_name
0,afro,37i9dQZF1DWYkaDif7Ztbp,African Heat
1,afro,37i9dQZF1DX6036iaZ2MYP,Peppeh
2,afro,37i9dQZF1DX6BsbcWKm1XO,Gold Mine
3,afro,37i9dQZF1DWT6SJaitNDax,We Everywhere
4,afro,37i9dQZF1DX2TExIcUe1gg,Shuga Cane


In [24]:
retrieve_audio_features_playlists(user,df)

100%|██████████████████████████████████████████████████████████████████████████████| 1587/1587 [10:19<00:00,  2.56it/s]


Process finished!


In [25]:
# testing 

df_test = pd.read_csv(tracks_info_folder+"tracks_info_category_summer_playlist_id_37i9dQZF1DX83I5je4W4rP_2019-06-02.csv")
df_test.head()

Unnamed: 0,category_id,playlist_name,playlist_id,track_id,track_uri,track_preview_url,track_name,track_popularity,artist_name,album_name,...,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,summer,Beach Vibes,37i9dQZF1DX83I5je4W4rP,78v5ljpPryjGY4r1wndiqa,spotify:track:78v5ljpPryjGY4r1wndiqa,https://p.scdn.co/mp3-preview/ccbd4e2281a72270...,Knocking at Your Door,55,O.A.R.,The Mighty,...,216200,0.751,0.0,6,0.0527,-5.671,0,0.0449,89.973,0.835
1,summer,Beach Vibes,37i9dQZF1DX83I5je4W4rP,7EIHl1xmO7UEtTEPczPj8i,spotify:track:7EIHl1xmO7UEtTEPczPj8i,https://p.scdn.co/mp3-preview/f0794a54b31c6d8a...,Island Time,52,Bumpin Uglies,Island Time,...,258773,0.751,0.00155,6,0.0643,-4.063,1,0.039,151.884,0.961
2,summer,Beach Vibes,37i9dQZF1DX83I5je4W4rP,4tNYuLrlxTb8DSVL3bO9F1,spotify:track:4tNYuLrlxTb8DSVL3bO9F1,https://p.scdn.co/mp3-preview/d411bc9427fe8da1...,Sunday Vibe,51,"Cisco Adler,G. Love & Special Sauce",Sunday Vibe,...,186760,0.662,0.0,7,0.136,-7.008,1,0.198,155.78,0.532
3,summer,Beach Vibes,37i9dQZF1DX83I5je4W4rP,5JmJVj3qLsCnBsQ8IC9XLf,spotify:track:5JmJVj3qLsCnBsQ8IC9XLf,https://p.scdn.co/mp3-preview/7317d8f850700e9d...,Lazy Afternoon,58,Rebelution,Bright Side of Life,...,210347,0.678,0.0105,7,0.125,-6.915,1,0.0351,147.847,0.93
4,summer,Beach Vibes,37i9dQZF1DX83I5je4W4rP,0TGLKJiH9vHx4Xb8FVjWrO,spotify:track:0TGLKJiH9vHx4Xb8FVjWrO,https://p.scdn.co/mp3-preview/29b81f2d8263059d...,Warning (feat. Stick Figure),55,"Stick Figure,Pepper",Warning (feat. Stick Figure),...,232160,0.527,0.118,1,0.131,-7.191,0,0.118,76.022,0.52


In [26]:
df_test.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 22 columns):
category_id          50 non-null object
playlist_name        50 non-null object
playlist_id          50 non-null object
track_id             50 non-null object
track_uri            50 non-null object
track_preview_url    36 non-null object
track_name           50 non-null object
track_popularity     50 non-null int64
artist_name          50 non-null object
album_name           50 non-null object
acousticness         50 non-null float64
danceability         50 non-null float64
duration_ms          50 non-null int64
energy               50 non-null float64
instrumentalness     50 non-null float64
key                  50 non-null int64
liveness             50 non-null float64
loudness             50 non-null float64
mode                 50 non-null int64
speechiness          50 non-null float64
tempo                50 non-null float64
valence              50 non-null float64
dtypes: floa