In [69]:
# https://medium.com/@rafaelnduarte/how-to-retrieve-data-from-spotify-110c859ab304
# https://github.com/plamere/spotipy
# https://spotipy.readthedocs.io/en/2.19.0/#api-reference

import os
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials

os.environ['SPOTIPY_REDIRECT_URI'] = 'https://gui-portfolio.herokuapp.com/'

# Get environment variables
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')
SPOTIPY_REDIRECT_URI = os.environ.get('SPOTIPY_REDIRECT_URI')

# Authenticate credentials
scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [70]:
# Get tracks data for all saved tracks
artist_name = []
track_name = []
popularity = []
track_id = []

results = sp.current_user_saved_tracks()        
        
for i in range(0, 1682, 20):
    results = sp.current_user_saved_tracks(limit = 20, offset = i)
    for t in results['items']:
        artist_name.append(t['track']['artists'][0]['name'])
        track_name.append(t['track']['name'])
        track_id.append(t['track']['id'])
        popularity.append(t['track']['popularity'])
        
# Check if number of tracks is right - should be 1682
print(len(track_id))

1682


In [79]:
# Create dataframe with the data retrieved
df_mytracks = pd.DataFrame({'artist_name': artist_name, 'track_name': track_name, 'track_id': track_id, 'popularity': popularity})

# check if everything is ok
print(df_mytracks.shape)
df_mytracks.head()

(1682, 4)


Unnamed: 0,artist_name,track_name,track_id,popularity
0,Adventure,Wipe Out-Let's Go!,11fYDBbxgKGVio1i9AkAIB,0
1,The Jackson 5,I Want You Back,5LxvwujISqiB8vpRYv887S,74
2,Daryl Hall & John Oates,Maneater,7j74lucZ59vqN67Ipe2ZcY,72
3,Electric Light Orchestra,Mr. Blue Sky,2RlgNHKcydI9sayD2Df2xp,78
4,Marvin Gaye,Ain't No Mountain High Enough,7tqhbajSfrz2F7E1Z75ASX,79


In [80]:
# Group the entries by artist_name and track_name
grouped = df_mytracks.groupby(['artist_name','track_name'], as_index=True).size()

# Check for duplicates
grouped[grouped > 1].count() # 36 duplicates

36

In [81]:
# Drop duplicates
df_mytracks.drop_duplicates(subset=['artist_name','track_name'], inplace=True)

In [83]:
# Retrieve audio features from liked songs
rows = []
batchsize = 100
None_counter = 0

for i in range(0,len(df_mytracks['track_id']), batchsize):
    batch = df_mytracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter + 1
        else:
            rows.append(t)
            
print('Number of tracks where no audio features were available:', None_counter)

Number of tracks where no audio features were available: 0


In [86]:
# Saving the features in a data frame
df_audio_features = pd.DataFrame.from_dict(rows, orient='columns')

# Check if df was created properly
print("Shape of the dataset:", df_audio_features.shape) 
df_audio_features.head()

Shape of the dataset: (1646, 18)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.306,0.581,0,-11.334,1,0.0383,0.0631,0.911,0.0988,0.895,166.307,audio_features,11fYDBbxgKGVio1i9AkAIB,spotify:track:11fYDBbxgKGVio1i9AkAIB,https://api.spotify.com/v1/tracks/11fYDBbxgKGV...,https://api.spotify.com/v1/audio-analysis/11fY...,224693,4
1,0.469,0.538,8,-13.559,1,0.0571,0.305,0.00012,0.37,0.886,196.605,audio_features,5LxvwujISqiB8vpRYv887S,spotify:track:5LxvwujISqiB8vpRYv887S,https://api.spotify.com/v1/tracks/5LxvwujISqiB...,https://api.spotify.com/v1/audio-analysis/5Lxv...,176333,4
2,0.724,0.498,11,-13.083,0,0.0423,0.0405,8.6e-05,0.0761,0.823,88.825,audio_features,7j74lucZ59vqN67Ipe2ZcY,spotify:track:7j74lucZ59vqN67Ipe2ZcY,https://api.spotify.com/v1/tracks/7j74lucZ59vq...,https://api.spotify.com/v1/audio-analysis/7j74...,273173,4
3,0.388,0.338,10,-10.054,1,0.0329,0.652,4e-06,0.248,0.478,177.765,audio_features,2RlgNHKcydI9sayD2Df2xp,spotify:track:2RlgNHKcydI9sayD2Df2xp,https://api.spotify.com/v1/tracks/2RlgNHKcydI9...,https://api.spotify.com/v1/audio-analysis/2Rlg...,303373,4
4,0.663,0.6,7,-10.87,1,0.032,0.43,0.0,0.184,0.8,129.991,audio_features,7tqhbajSfrz2F7E1Z75ASX,spotify:track:7tqhbajSfrz2F7E1Z75ASX,https://api.spotify.com/v1/tracks/7tqhbajSfrz2...,https://api.spotify.com/v1/audio-analysis/7tqh...,151667,4


In [87]:
# Drop unnecessary columns
columns_to_drop = ['analysis_url','track_href','type','uri']
df_audio_features.drop(columns_to_drop, axis=1, inplace=True)

# Rename track_id column to id
df_audio_features.rename(columns={'id': 'track_id'}, inplace=True)

# Check dimensions of data frame
df_audio_features.shape

(1646, 14)

In [89]:
# Merge dataframes
df = pd.merge(df_mytracks, df_audio_features, on='track_id', how='inner')

# Check if merge was successful
print("Shape of the dataset:", df_audio_features.shape)
df.head()

Shape of the dataset: (1646, 14)


Unnamed: 0,artist_name,track_name,track_id,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Adventure,Wipe Out-Let's Go!,11fYDBbxgKGVio1i9AkAIB,0,0.306,0.581,0,-11.334,1,0.0383,0.0631,0.911,0.0988,0.895,166.307,224693,4
1,The Jackson 5,I Want You Back,5LxvwujISqiB8vpRYv887S,74,0.469,0.538,8,-13.559,1,0.0571,0.305,0.00012,0.37,0.886,196.605,176333,4
2,Daryl Hall & John Oates,Maneater,7j74lucZ59vqN67Ipe2ZcY,72,0.724,0.498,11,-13.083,0,0.0423,0.0405,8.6e-05,0.0761,0.823,88.825,273173,4
3,Electric Light Orchestra,Mr. Blue Sky,2RlgNHKcydI9sayD2Df2xp,78,0.388,0.338,10,-10.054,1,0.0329,0.652,4e-06,0.248,0.478,177.765,303373,4
4,Marvin Gaye,Ain't No Mountain High Enough,7tqhbajSfrz2F7E1Z75ASX,79,0.663,0.6,7,-10.87,1,0.032,0.43,0.0,0.184,0.8,129.991,151667,4


In [91]:
# Create csv file with data retrieved
df.to_csv('spotify_mytracks_data.csv')