In [9]:
# Importing necessary libraries

import spotipy
import pandas as pd
import numpy as np
import time
from requests.exceptions import ReadTimeout
from spotipy.exceptions import SpotifyException
from timeit import default_timer as timer
from datetime import timedelta
from pandas.api.types import CategoricalDtype
import os
import configparser

In [10]:
# Spotify API tokens for access
current_dir = os.path.dirname(os.path.abspath("Kendrick Lamar"))

config_path = os.path.join(current_dir, '..', 'config.ini')

config = configparser.ConfigParser()

# Read the config file
config.read(config_path)
            
client_id = config.get('credentials', 'Client_ID')
client_secret = config.get('credentials', 'Client_Secret')

client_credentials_manager = SpotifyClientCredentials(
                                client_id = client_id, client_secret = client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) 

In [6]:
# function to know how long it'll take to scrape the data

def format_time(seconds):
    minutes = seconds // 60
    seconds = seconds % 60
    if minutes > 0:
        return f"{minutes} minutes, {seconds} seconds"
    else:
        return f"{seconds} seconds"

In [17]:
# start time
start_time = timer()

# For Burna Boy as a main artist
artist_name = []
track_name = []
track_id = []
album_name = []
album_id = []
release_date = []
popularity = []
explicit = []
danceability = []
duration_ms = []
energy = []
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
time_signature = []
featured_artists = []

# loop through results, using offset to get all tracks
for i in range(0, 1000, 50):
    try:
        track_results = sp.search(q='artist: Burna Boy', type='track', limit=50, offset=i)
        for i, t in enumerate(track_results['tracks']['items']):
        
            # get track details
            artist_name.append(t['artists'][0]['name'])
            track_name.append(t['name'])
            track_id.append(t['id'])
            album_name.append(t['album']['name'])
            album_id.append(t['album']['id'])
            release_date.append(t['album']['release_date'])
            popularity.append(t['popularity'])
            explicit.append(t['explicit'])

            # get audio features for track
            audio_features = sp.audio_features(t['id'])[0]
            danceability.append(audio_features['danceability'])
            duration_ms.append(audio_features['duration_ms'])
            energy.append(audio_features['energy'])
            key.append(audio_features['key'])
            loudness.append(audio_features['loudness'])
            mode.append(audio_features['mode'])
            speechiness.append(audio_features['speechiness'])
            acousticness.append(audio_features['acousticness'])
            instrumentalness.append(audio_features['instrumentalness'])
            liveness.append(audio_features['liveness'])
            valence.append(audio_features['valence'])
            tempo.append(audio_features['tempo'])
            time_signature.append(audio_features['time_signature'])
        

            # get featured artists
            if len(t['artists']) > 1:
                feat_artists = []
                for j in range(1, len(t['artists'])):
                    feat_artists.append(t['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
    except ReadTimeout as e:
        print(f"Error: {e}. Retrying in 5 seconds...")
        time.sleep(5)  # Retry after a short delay
    except SpotifyException as e:
        print(f"Spotify API Error: {e}")
        break

# create dataframe from lists
df_raw = pd.DataFrame({
    'artist_name': artist_name,
    'track_name': track_name,
    'track_id': track_id,
    'album_name': album_name,
    'album_id': album_id,
    'release_date': release_date,
    'duration_ms': duration_ms,
    'popularity': popularity,
    'explicit': explicit,
    'danceability': danceability,
    'energy': energy,
    'key': key,
    'loudness': loudness,
    'mode': mode,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'time_signature': time_signature,
    'featured_artists': featured_artists
})



# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {format_time(elapsed_time)}")

Max Retries reached


Spotify API Error: http status: 429, code:-1 - /v1/audio-features/?ids=7fA7mrYaXVDVVGCAV65NRN:
 Max Retries, reason: too many 429 error responses


ValueError: All arrays must be of the same length

In [36]:
# keeping records with only 'Burna Boy' as the artist name
df_burna = df_raw[df_raw['artist_name']=='Burna Boy']

In [37]:
# Burna's Spotify has one album with two spelling variations: 'On a Spaceship' and 'On A Spaceship' which are basically the same thing and this is proof 

spaceship = df_burna['album_name'].str.contains('spaceship', case=False)
proof = df_burna.loc[spaceship].sort_values(by= 'track_name', ascending=False)
proof.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,duration_ms,popularity,explicit,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists
174,Burna Boy,Trance,6QGD01eMx3grmAUcVpZMBr,On a Spaceship,4tjD2jPZdPe3nqop8En6zb,2015-11-25,216680,21,False,0.882,...,-9.649,0,0.152,0.204,0.000687,0.133,0.912,107.98,4,[]
238,Burna Boy,Trance,6uSOJlK9AbARydVHicLROO,On A Spaceship,6NW4thIIMVcb6Wedxqu6ev,2015-03-13,216680,15,False,0.883,...,-9.61,0,0.153,0.202,0.000748,0.131,0.922,107.994,4,[]
49,Burna Boy,Soke,4bzS0aUqNJIKiXk86k4tol,On A Spaceship,6NW4thIIMVcb6Wedxqu6ev,2015-03-13,218867,53,False,0.647,...,-11.174,1,0.104,0.599,6e-06,0.102,0.883,203.868,4,[]
116,Burna Boy,Soke,1rFeMjIzNOngxR5sMS6A3I,On a Spaceship,4tjD2jPZdPe3nqop8En6zb,2015-11-25,218867,33,False,0.826,...,-11.18,1,0.0558,0.604,7e-06,0.106,0.876,101.989,4,[]
218,Burna Boy,Single (feat. Wizkid),5jMVBG8g9mpMPviRopnY4k,On a Spaceship,4tjD2jPZdPe3nqop8En6zb,2015-11-25,211027,27,False,0.862,...,-5.994,0,0.0677,0.326,4e-06,0.105,0.627,101.971,4,[Wizkid]


In [38]:
# Having examined the content of the two album spelling variations, we'll now keep one: 'On a Spaceship'
# It is noteworthy that 'On a Spaceship' is more complete than the other, hence this approach
df_burna.loc[:, 'album_name'] = df_burna['album_name'].replace('On A Spaceship', 'On a Spaceship')

# Filter the main DataFrame for the album 'On a Spaceship'
df_spaceship = df_burna[df_burna['album_name'] == 'On a Spaceship']

# Identify and keep rows with a higher popularity among duplicates based on duration_ms
df_no_duplicates = df_spaceship.sort_values(by=['duration_ms', 'popularity'], ascending=[False, False]).drop_duplicates(subset='duration_ms')

# Create a boolean mask to identify rows related to the album 'On a Spaceship'
mask_spaceship_album = df_burna['album_name'] == 'On a Spaceship'

# Update the main DataFrame by replacing the relevant rows with the processed data
df_burna.loc[mask_spaceship_album, :] = df_no_duplicates

# Reset the index of the main DataFrame
df_burna = df_burna.reset_index(drop=True)

In [39]:
# creating a new column to categorize songs to either Singles, Albums, EPs, Compilations

# Do this separately for the main_track df and the ft df, call it features for the ft df, before concat

albums = ['I Told Them...', 
                    'Love, Damini',
                    'Twice As Tall',
                    'African Giant',
                    'Outside',
                    'On a Spaceship',
                    'L.I.F.E - Leaving an Impact for Eternity (Deluxe Edition)']

ep = ['Redemption', 'Steel & Copper']

compilation = ['Best of Burna Boy', 'Best of Burn Series, Vol. 1']

# Define conditions for each type
single_condition = df_burna['track_name'] == df_burna['album_name']
album_condition = df_burna['album_name'].isin(albums)
ep_condition = df_burna['album_name'].isin(ep)
compilation_condition = df_burna['album_name'].isin(compilation)

# Update 'type' based on conditions
df_burna.loc[single_condition, 'type'] = 'Single'
df_burna.loc[album_condition, 'type'] = 'Album'
df_burna.loc[ep_condition, 'type'] = 'EP'
df_burna.loc[compilation_condition, 'type'] = 'Compilation'

In [29]:
# scraping a playlist that contains all Burna Boy's features 

# start time
start_time = timer()

# Get playlist URI
playlist_uri = "https://open.spotify.com/playlist/0td2WSZ8vJjToLUjq3VELy?si=1d42b850ed6d421b"

# Get playlist tracks
playlist = sp.playlist(playlist_uri)
tracks = playlist['tracks']['items']

# Initialize data structures
song_features = []

# Define offset for pagination
offset = 0
limit = 100

# Iterate through playlist tracks, making additional requests as needed
while True:
    # Get playlist tracks with pagination
    results = sp.user_playlist_tracks(user="Uncle Pat", playlist_id=playlist_uri, offset=offset, limit=limit)

    # Check if there are more tracks to retrieve
    if not results['items']:
        break

    # Extract track information and audio features
    for track in results['items']:
        track_uri = track['track']['uri']
        track_name = track['track']['name']

        # Extract track information
        track_info = sp.track(track_uri)
        artist_name = track_info['artists'][0]['name']
        track_name = track_info['name']
        track_id = track_info['id']
        album_name = track_info['album']['name']
        album_id = track_info['album']['id']
        release_date = track_info['album']['release_date']
        duration_ms = track_info['duration_ms']
        popularity = track_info['popularity']
        explicit = track_info['explicit']
        
        
        # Extract audio features
        track_features = sp.audio_features(track_uri)

        # Check if audio features are available
        if track_features:
            track_features = track_features[0]

            # Extract audio features
            danceability = track_features['danceability']
            energy = track_features['energy']
            key = track_features['key']
            loudness = track_features['loudness']
            mode = track_features['mode']
            speechiness = track_features['speechiness']
            acousticness = track_features['acousticness']
            instrumentalness = track_features['instrumentalness']
            liveness = track_features['liveness']
            valence = track_features['valence']            
            tempo = track_features['tempo']
            time_signature = track_features['time_signature']
            
            # get featured artists
            featured_artists = []
            if len(track_info['artists']) > 1:
                feat_artists = []
                for j in range(1, len(track_info['artists'])):
                    feat_artists.append(track_info['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
   

            # Store data
            song_features.append([
                artist_name, track_name, track_id, album_name,
                album_id, release_date, duration_ms, popularity,
                explicit, danceability, energy, key, loudness,
                mode, speechiness, acousticness, instrumentalness,
                liveness, valence, tempo, time_signature, featured_artists
            ])
        else:
            print("Audio features not available for track:", track_name)

    # Update offset for next request
    offset += limit

# Save data to CSV file
ft = pd.DataFrame(
    song_features, 
    columns=[ 'artist_name', 'track_name', 'track_id', 'album_name', 'album_id',
                'release_date', 'duration_ms', 'popularity', 'explicit',
                'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness',
                'valence', 'tempo', 'time_signature', 'featured_artists'])

# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {format_time(elapsed_time)}")

Elapsed time: 1 minutes, 0 seconds


In [60]:
ft.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,duration_ms,popularity,explicit,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists,type
0,Lil Durk,All My Life (Burna Boy Remix),6vyM1yoPhhdezAcW8CmCJq,All My Life (Remixes) (feat. Burna Boy),11OlFW8YpwZTlhAdc0NCV7,2023-10-13,264179,65,True,0.787,...,1,0.226,0.071,0.0,0.139,0.698,142.945,4,"[[Burna Boy, J. Cole]]",Feature
1,Burna Boy,Talibans II,5zq5fYXIE7X842DD5HBiiB,Talibans II,4EouhaKTJkW8xRpLsCz7va,2023-07-20,176453,66,True,0.914,...,0,0.31,0.227,3e-06,0.132,0.54,102.937,4,[[Byron Messia]],Feature
2,J Hus,Masculine (feat. Burna Boy),4M6yrN4rJUrwNgHzGKfrN1,Beautiful And Brutal Yard,766bxryPZBL0hjz0KM6VUD,2023-07-14,204880,65,True,0.675,...,1,0.291,0.179,2e-05,0.127,0.713,66.98,5,[[Burna Boy]],Feature
3,Burna Boy,Alone,0vLj7fwt3HM36zo7HEghCL,Black Panther: Wakanda Forever - Music From an...,0Wyn0fD9sZIu00xTyXF4eK,2022-11-04,221746,0,False,0.607,...,0,0.049,0.153,0.0,0.113,0.317,90.153,4,[[]],Feature
4,Dave,Location (feat. Burna Boy),3z4CGd63tpUn9a6oQSG0CI,PSYCHODRAMA,4GrFuXwRmEBJec22p58fsD,2019-03-08,241293,80,True,0.812,...,0,0.297,0.271,0.0,0.0955,0.55,109.979,4,[[Burna Boy]],Feature


In [61]:
# creating a new column to categorize songs from this playlist to Features
ft['type']='Feature' 

In [62]:
# joining the df_burna and ft dataframes
df = pd.concat([df_burna, ft], ignore_index= True, sort= False)

In [63]:
# Eliminating rows that share the same track name 

# The main track has higher popularity than duplicate tracks that share the same name hence it was an elimination by popularity

df = df.sort_values(by= ['track_name', 'popularity'], ascending= [True, False])
df.drop_duplicates(subset= 'track_name', keep= 'first', inplace= True, ignore_index= True)

In [64]:
# check the records that wasn't assigned any song type ie. single, album, feature, ep, compilation
notype = df['type'].isna()
flush = df.loc[notype].sort_values(by= 'popularity', ascending= False) 
flush.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,duration_ms,popularity,explicit,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists,type
200,Burna Boy,Sittin' on Top of the World,10AY7bWxFWedosyHbWS5Eu,SUMMER AFROBEAT,2un64S6DNNMTkHAwQWfE0i,2023-07-14,159372.0,44.0,True,0.787,...,1.0,0.0516,0.0279,0.0058,0.109,0.681,98.051,4.0,[],
46,Burna Boy,City Boys - Mixed,7v9AmbAXIoFCpAfSz0CHwF,"InterSpace Naija: DJ VENUM, Oct 23 (DJ Mix)",2HaJJYuhULX9mgjwaQOGir,2023-10-01,58250.0,27.0,True,0.77,...,1.0,0.175,0.0694,0.0,0.201,0.606,101.808,4.0,[],
77,Burna Boy,For My Hand,6ZWi0eZoOoAQjZI2y9oPVr,Beat Street,0tHtN1Hf2a5CXKiX6IX1IW,2023-06-09,159124.0,17.0,False,0.87,...,1.0,0.165,0.161,0.000157,0.209,0.717,104.059,4.0,[Ed Sheeran],
54,Burna Boy,Cono,7s0EHnfP8rVucrhwdRuWub,Hood,2ShexYyd4cUzJa2CnvpXFq,2009,179017.0,17.0,False,0.786,...,1.0,0.443,0.173,1.8e-05,0.0966,0.0751,98.217,4.0,[],
18,Burna Boy,"Alone - From ""Black Panther: Wakanda Forever -...",3EqAZ2oeUjys8gc175Vr1D,The Biggest Hits of 2022,5puVkX3wK31KAxt80KFkWx,2023-06-10,221747.0,12.0,False,0.6,...,0.0,0.0542,0.176,0.0,0.111,0.307,89.955,4.0,[],


In [65]:
# flush them
index_flush = flush.index
df = df.drop(index_flush)

In [66]:
# modifying the 'mode' column by replacing 0 with Minor and 1 with Major
df['mode'].replace({0:'Minor', 1:'Major'}, inplace= True)

In [67]:
# modifying the duration_ms column to appear as minutes and seconds 

df['duration_ms'] = pd.to_timedelta(df['duration_ms'], unit='ms')

# extract the minutes and seconds components of the duration as strings
df['duration'] = df['duration_ms'].dt.components['minutes'].astype(str).str.zfill(2) + ':' + df['duration_ms'].dt.components['seconds'].astype(str).str.zfill(2)
df.drop(columns=['duration_ms'], inplace= True)
df[['track_name','duration']].head()

Unnamed: 0,track_name,duration
0,12 Jewels (feat. RZA),00:27
1,20 10 20,03:33
2,23,04:05
3,34,02:40
4,69 (feat. Burna Boy & Ikechukwu),04:06


In [72]:
# creating three new fields: year, month and day a song was released from release date column
df['release_date'] = pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month_name()
df['day_of_the_week'] = df['release_date'].dt.day_name()

df[['track_name', 'year', 'month', 'day_of_the_week']].head()

Unnamed: 0,track_name,year,month,day_of_the_week
0,12 Jewels (feat. RZA),2023,August,Thursday
1,20 10 20,2020,October,Thursday
2,23,2020,August,Thursday
3,34,2019,March,Thursday
4,69 (feat. Burna Boy & Ikechukwu),2016,October,Saturday


In [73]:
# Time to remove the square brackets from the featured artist column 

# They have square brackets in the first place because it's a group of smaller lists in a larger list that was embbedded in the DataFrame

# We convert the data type to string no matter what, then remove the square brackets and apostrophe around it 

df['featured_artists'] = df['featured_artists'].astype(str)
df['featured_artists'] = df['featured_artists'].replace({"'": '', '"': '', r'\[':'', r'\]':''}, regex=True)
df[['track_name', 'featured_artists']].head()

Unnamed: 0,track_name,featured_artists
0,12 Jewels (feat. RZA),RZA
1,20 10 20,
2,23,
3,34,DJDS
4,69 (feat. Burna Boy & Ikechukwu),"Burna Boy, Ikechukwu"


In [74]:
key_dict = {
    0: "C",
    1: "C#/Db",
    2: "D",
    3: "D#/Eb",
    4: "E",
    5: "F",
    6: "F#/Gb",
    7: "G",
    8: "G#/Ab",
    9: "A",
    10: "A#/Bb",
    11: "B",
    -1: "NaN"
}

df['key'] = df['key'].map(key_dict).fillna("NaN")
df[['track_name', 'key']].head()

Unnamed: 0,track_name,key
0,12 Jewels (feat. RZA),F#/Gb
1,20 10 20,F#/Gb
2,23,F
3,34,F#/Gb
4,69 (feat. Burna Boy & Ikechukwu),C


In [95]:
df.to_csv('Burna Boy Discography.csv', index= False)