In [7]:
# https://medium.com/@rafaelnduarte/how-to-retrieve-data-from-spotify-110c859ab304
# https://github.com/plamere/spotipy
# https://spotipy.readthedocs.io/en/2.19.0/#api-reference

import os
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

# Get environment variables
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')
SPOTIPY_REDIRECT_URI = os.environ.get('SPOTIPY_REDIRECT_URI')

# Authenticate credentials
scope = "user-library-read"
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [8]:
# Get tracks data for all saved tracks
artist_name = []
track_name = []
popularity = []
track_id = []

results = sp.current_user_saved_tracks()        
        
for i in range(0, 1682, 20):
    results = sp.current_user_saved_tracks(limit = 20, offset = i)
    for t in results['items']:
        artist_name.append(t['track']['artists'][0]['name'])
        track_name.append(t['track']['name'])
        track_id.append(t['track']['id'])
        popularity.append(t['track']['popularity'])
        
# Check if number of tracks is right - should be 1682
print(len(track_id))

1009


In [5]:
# Create dataframe with the data retrieved
df_mytracks = pd.DataFrame({'artist_name': artist_name, 'track_name': track_name, 'track_id': track_id, 'popularity': popularity})

# check if everything is ok
print(df_mytracks.shape)
df_mytracks.head()

(1009, 4)


Unnamed: 0,artist_name,track_name,track_id,popularity
0,Gorgon City,Oxygen - Terrace Dub,2Ij6YmwBU1mMNBJDJuHqyU,44
1,Hoodboi,True Colors,04FJtRhH1VIR1jMPHRRKh3,34
2,Farruko,Pepas,5fwSHlTEWpluwOM0Sxnh5k,88
3,EVAN GIIA,WESTWORLD,6ZP2iPx7t4epRBAKWvRPt1,60
4,Young Thug,Livin It Up (with Post Malone & A$AP Rocky),7zjEyeBsaw9gV0jofJLfOM,75


In [11]:
# Group the entries by artist_name and track_name
grouped = df_mytracks.groupby(['artist_name','track_name'], as_index=True).size()

# Check for duplicates
grouped[grouped > 1].count() # 36 duplicates

13

In [12]:
# Drop duplicates
df_mytracks.drop_duplicates(subset=['artist_name','track_name'], inplace=True)

In [13]:
# Retrieve audio features from liked songs
rows = []
batchsize = 100
None_counter = 0

for i in range(0,len(df_mytracks['track_id']), batchsize):
    batch = df_mytracks['track_id'][i:i+batchsize]
    feature_results = sp.audio_features(batch)
    for i, t in enumerate(feature_results):
        if t == None:
            None_counter = None_counter + 1
        else:
            rows.append(t)
            
print('Number of tracks where no audio features were available:', None_counter)

Number of tracks where no audio features were available: 1


In [14]:
# Saving the features in a data frame
df_audio_features = pd.DataFrame.from_dict(rows, orient='columns')

# Check if df was created properly
print("Shape of the dataset:", df_audio_features.shape) 
df_audio_features.head()

Shape of the dataset: (993, 18)


Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.688,0.648,1,-10.1,1,0.0429,0.00216,0.123,0.106,0.0548,124.02,audio_features,2Ij6YmwBU1mMNBJDJuHqyU,spotify:track:2Ij6YmwBU1mMNBJDJuHqyU,https://api.spotify.com/v1/tracks/2Ij6YmwBU1mM...,https://api.spotify.com/v1/audio-analysis/2Ij6...,249677,4
1,0.84,0.593,10,-8.976,0,0.137,0.641,0.708,0.328,0.169,110.013,audio_features,04FJtRhH1VIR1jMPHRRKh3,spotify:track:04FJtRhH1VIR1jMPHRRKh3,https://api.spotify.com/v1/tracks/04FJtRhH1VIR...,https://api.spotify.com/v1/audio-analysis/04FJ...,236182,4
2,0.762,0.766,7,-3.955,1,0.0343,0.00776,7e-05,0.128,0.442,130.001,audio_features,5fwSHlTEWpluwOM0Sxnh5k,spotify:track:5fwSHlTEWpluwOM0Sxnh5k,https://api.spotify.com/v1/tracks/5fwSHlTEWplu...,https://api.spotify.com/v1/audio-analysis/5fwS...,287120,4
3,0.712,0.665,11,-6.128,1,0.0514,0.206,2.7e-05,0.137,0.274,112.017,audio_features,6ZP2iPx7t4epRBAKWvRPt1,spotify:track:6ZP2iPx7t4epRBAKWvRPt1,https://api.spotify.com/v1/tracks/6ZP2iPx7t4ep...,https://api.spotify.com/v1/audio-analysis/6ZP2...,209224,4
4,0.767,0.313,7,-12.059,1,0.0798,0.838,0.0,0.105,0.765,82.582,audio_features,7zjEyeBsaw9gV0jofJLfOM,spotify:track:7zjEyeBsaw9gV0jofJLfOM,https://api.spotify.com/v1/tracks/7zjEyeBsaw9g...,https://api.spotify.com/v1/audio-analysis/7zjE...,210907,4


In [15]:
# Drop unnecessary columns
columns_to_drop = ['analysis_url','track_href','type','uri']
df_audio_features.drop(columns_to_drop, axis=1, inplace=True)

# Rename track_id column to id
df_audio_features.rename(columns={'id': 'track_id'}, inplace=True)

# Check dimensions of data frame
df_audio_features.shape

(993, 14)

In [16]:
# Merge dataframes
df = pd.merge(df_mytracks, df_audio_features, on='track_id', how='inner')

# Check if merge was successful
print("Shape of the dataset:", df_audio_features.shape)
df.head()

Shape of the dataset: (993, 14)


Unnamed: 0,artist_name,track_name,track_id,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Gorgon City,Oxygen - Terrace Dub,2Ij6YmwBU1mMNBJDJuHqyU,44,0.688,0.648,1,-10.1,1,0.0429,0.00216,0.123,0.106,0.0548,124.02,249677,4
1,Hoodboi,True Colors,04FJtRhH1VIR1jMPHRRKh3,34,0.84,0.593,10,-8.976,0,0.137,0.641,0.708,0.328,0.169,110.013,236182,4
2,Farruko,Pepas,5fwSHlTEWpluwOM0Sxnh5k,88,0.762,0.766,7,-3.955,1,0.0343,0.00776,7e-05,0.128,0.442,130.001,287120,4
3,EVAN GIIA,WESTWORLD,6ZP2iPx7t4epRBAKWvRPt1,60,0.712,0.665,11,-6.128,1,0.0514,0.206,2.7e-05,0.137,0.274,112.017,209224,4
4,Young Thug,Livin It Up (with Post Malone & A$AP Rocky),7zjEyeBsaw9gV0jofJLfOM,75,0.767,0.313,7,-12.059,1,0.0798,0.838,0.0,0.105,0.765,82.582,210907,4


In [17]:
# Create csv file with data retrieved
df.to_csv('spotify_hentracks_data.csv')