In [1]:
import spotipy
import pandas as pd
import requests
import numpy as np
import time
from requests.exceptions import ReadTimeout
from spotipy.exceptions import SpotifyException
from timeit import default_timer as timer
from datetime import timedelta
import configparser
from spotipy.oauth2 import SpotifyClientCredentials
from pandas.api.types import CategoricalDtype
import configparser
import os

In [5]:
# Spotify API tokens for access
current_dir = os.path.dirname(os.path.abspath("Burna Boy"))

config_path = os.path.join(current_dir, '..', 'config.ini')

config = configparser.ConfigParser()

# Read the config file
config.read(config_path)
            
client_id = config.get('credentials', 'Client_ID')
client_secret = config.get('credentials', 'Client_Secret')

client_credentials_manager = SpotifyClientCredentials(
                                client_id = client_id, client_secret = client_secret)

sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) 

In [2]:
# function to know how long it'll take to scrape the data

def format_time(seconds):
    minutes = seconds // 60
    seconds = seconds % 60
    if minutes > 0:
        return f"{minutes} minutes, {seconds} seconds"
    else:
        return f"{seconds} seconds"

In [None]:
# start time
start_time = timer()

# For Burna Boy as a main artist
artist_name = []
track_name = []
track_id = []
album_name = []
album_id = []
release_date = []
popularity = []
explicit = []
danceability = []
duration_ms = []
energy = []
key = []
loudness = []
mode = []
speechiness = []
acousticness = []
instrumentalness = []
liveness = []
valence = []
tempo = []
time_signature = []
featured_artists = []

# loop through results, using offset to get all tracks
for i in range(0, 1000, 50):
    try:
        track_results = sp.search(q='artist: Burna Boy', type='track', limit=50, offset=i)
        for i, t in enumerate(track_results['tracks']['items']):
        
            # get track details
            artist_name.append(t['artists'][0]['name'])
            track_name.append(t['name'])
            track_id.append(t['id'])
            album_name.append(t['album']['name'])
            album_id.append(t['album']['id'])
            release_date.append(t['album']['release_date'])
            popularity.append(t['popularity'])
            explicit.append(t['explicit'])

            # get audio features for track
            audio_features = sp.audio_features(t['id'])[0]
            danceability.append(audio_features['danceability'])
            duration_ms.append(audio_features['duration_ms'])
            energy.append(audio_features['energy'])
            key.append(audio_features['key'])
            loudness.append(audio_features['loudness'])
            mode.append(audio_features['mode'])
            speechiness.append(audio_features['speechiness'])
            acousticness.append(audio_features['acousticness'])
            instrumentalness.append(audio_features['instrumentalness'])
            liveness.append(audio_features['liveness'])
            valence.append(audio_features['valence'])
            tempo.append(audio_features['tempo'])
            time_signature.append(audio_features['time_signature'])
        

            # get featured artists
            if len(t['artists']) > 1:
                feat_artists = []
                for j in range(1, len(t['artists'])):
                    feat_artists.append(t['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
    except ReadTimeout as e:
        print(f"Error: {e}. Retrying in 5 seconds...")
        time.sleep(5)  # Retry after a short delay
    except SpotifyException as e:
        print(f"Spotify API Error: {e}")
        break

# create dataframe from lists
df_raw = pd.DataFrame({
    'artist_name': artist_name,
    'track_name': track_name,
    'track_id': track_id,
    'album_name': album_name,
    'album_id': album_id,
    'release_date': release_date,
    'duration_ms': duration_ms,
    'popularity': popularity,
    'explicit': explicit,
    'danceability': danceability,
    'energy': energy,
    'key': key,
    'loudness': loudness,
    'mode': mode,
    'speechiness': speechiness,
    'acousticness': acousticness,
    'instrumentalness': instrumentalness,
    'liveness': liveness,
    'valence': valence,
    'tempo': tempo,
    'time_signature': time_signature,
    'featured_artists': featured_artists
})



# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {format_time(elapsed_time)}")

In [9]:
# keeping records with only 'Burna Boy' as the artist name
df_burna = df_raw[df_raw['artist_name']=='Burna Boy'].copy()

In [10]:
# Burna's Spotify has one album with two spelling variations: 'On a Spaceship' and 'On A Spaceship' which are basically the same thing and this is proof 

spaceship = df_burna['album_name'].str.contains('spaceship', case=False)
proof = df_burna.loc[spaceship].sort_values(by= 'track_name', ascending=False)
proof.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,duration_ms,popularity,explicit,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists
65,Burna Boy,Soke,4bzS0aUqNJIKiXk86k4tol,On A Spaceship,6NW4thIIMVcb6Wedxqu6ev,2015-03-13,218867,47,False,0.647,...,-11.174,1,0.104,0.599,6e-06,0.102,0.883,203.868,4,[]
163,Burna Boy,Soke,1rFeMjIzNOngxR5sMS6A3I,On a Spaceship,4tjD2jPZdPe3nqop8En6zb,2015-11-25,218867,29,False,0.826,...,-11.18,1,0.0558,0.604,7e-06,0.106,0.876,101.989,4,[]
215,Burna Boy,Single (feat. Wizkid),5jMVBG8g9mpMPviRopnY4k,On a Spaceship,4tjD2jPZdPe3nqop8En6zb,2015-11-25,211027,23,False,0.862,...,-5.994,0,0.0677,0.326,4e-06,0.105,0.627,101.971,4,[Wizkid]
246,Burna Boy,Single,7E9HGMyzcPKMEgQ6wYCZU5,On A Spaceship,6NW4thIIMVcb6Wedxqu6ev,2015-03-13,211027,19,False,0.862,...,-5.986,0,0.0701,0.331,4e-06,0.106,0.647,101.974,4,[Wizkid]
232,Burna Boy,Ring Ring,6QmjfGeiky2j95ckhRqBFP,On a Spaceship,4tjD2jPZdPe3nqop8En6zb,2015-11-25,210840,20,False,0.625,...,-4.578,0,0.0518,0.283,0.00293,0.0942,0.964,102.323,4,[]


In [11]:
# Having examined the content of the two album spelling variations, we'll now keep one: 'On a Spaceship'
# It is noteworthy that 'On a Spaceship' is more complete than the other, hence this approach
df_burna.loc[:, 'album_name'] = df_burna['album_name'].replace('On A Spaceship', 'On a Spaceship')

# Filter the main DataFrame for the album 'On a Spaceship'
df_spaceship = df_burna[df_burna['album_name'] == 'On a Spaceship']

# Identify and keep rows with a higher popularity among duplicates based on duration_ms
df_no_duplicates = df_spaceship.sort_values(by=['duration_ms', 'popularity'], ascending=[False, False]).drop_duplicates(subset='duration_ms')

# Create a boolean mask to identify rows related to the album 'On a Spaceship'
mask_spaceship_album = df_burna['album_name'] == 'On a Spaceship'

# Update the main DataFrame by replacing the relevant rows with the processed data
df_burna.loc[mask_spaceship_album, :] = df_no_duplicates

# Reset the index of the main DataFrame
df_burna = df_burna.reset_index(drop=True)

 nan]' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  df_burna.loc[mask_spaceship_album, :] = df_no_duplicates


In [12]:
# creating a new column to categorize songs to either Singles, Albums, EPs, Compilations

# Do this separately for the main_track df and the ft df, call it features for the ft df, before concat

albums = ['I Told Them...', 
                    'Love, Damini',
                    'Twice As Tall',
                    'African Giant',
                    'Outside',
                    'On a Spaceship',
                    'L.I.F.E - Leaving an Impact for Eternity (Deluxe Edition)']

ep = ['Redemption', 'Steel & Copper']

compilation = ['Best of Burna Boy', 'Best of Burn Series, Vol. 1']

# Define conditions for each type
single_condition = df_burna['track_name'] == df_burna['album_name']
album_condition = df_burna['album_name'].isin(albums)
ep_condition = df_burna['album_name'].isin(ep)
compilation_condition = df_burna['album_name'].isin(compilation)

# Update 'type' based on conditions
df_burna.loc[single_condition, 'type'] = 'Single'
df_burna.loc[album_condition, 'type'] = 'Album'
df_burna.loc[ep_condition, 'type'] = 'EP'
df_burna.loc[compilation_condition, 'type'] = 'Compilation'

In [14]:
# scraping a playlist that contains all Burna Boy's features 

# start time
start_time = timer()

# Get playlist URI
playlist_uri = "https://open.spotify.com/playlist/0td2WSZ8vJjToLUjq3VELy?si=1d42b850ed6d421b"

# Get playlist tracks
playlist = sp.playlist(playlist_uri)
tracks = playlist['tracks']['items']

# Initialize data structures
song_features = []

# Define offset for pagination
offset = 0
limit = 100

# Iterate through playlist tracks, making additional requests as needed
while True:
    # Get playlist tracks with pagination
    results = sp.user_playlist_tracks(user="Uncle Pat", playlist_id=playlist_uri, offset=offset, limit=limit)

    # Check if there are more tracks to retrieve
    if not results['items']:
        break

    # Extract track information and audio features
    for track in results['items']:
        track_uri = track['track']['uri']
        track_name = track['track']['name']

        # Extract track information
        track_info = sp.track(track_uri)
        artist_name = track_info['artists'][0]['name']
        track_name = track_info['name']
        track_id = track_info['id']
        album_name = track_info['album']['name']
        album_id = track_info['album']['id']
        release_date = track_info['album']['release_date']
        duration_ms = track_info['duration_ms']
        popularity = track_info['popularity']
        explicit = track_info['explicit']
        
        
        # Extract audio features
        track_features = sp.audio_features(track_uri)

        # Check if audio features are available
        if track_features:
            track_features = track_features[0]

            # Extract audio features
            danceability = track_features['danceability']
            energy = track_features['energy']
            key = track_features['key']
            loudness = track_features['loudness']
            mode = track_features['mode']
            speechiness = track_features['speechiness']
            acousticness = track_features['acousticness']
            instrumentalness = track_features['instrumentalness']
            liveness = track_features['liveness']
            valence = track_features['valence']            
            tempo = track_features['tempo']
            time_signature = track_features['time_signature']
            
            # get featured artists
            featured_artists = []
            if len(track_info['artists']) > 1:
                feat_artists = []
                for j in range(1, len(track_info['artists'])):
                    feat_artists.append(track_info['artists'][j]['name'])
                featured_artists.append(feat_artists)
            else:
                featured_artists.append([])
   

            # Store data
            song_features.append([
                artist_name, track_name, track_id, album_name,
                album_id, release_date, duration_ms, popularity,
                explicit, danceability, energy, key, loudness,
                mode, speechiness, acousticness, instrumentalness,
                liveness, valence, tempo, time_signature, featured_artists
            ])
        else:
            print("Audio features not available for track:", track_name)

    # Update offset for next request
    offset += limit

# Save data to CSV file
ft = pd.DataFrame(
    song_features, 
    columns=[ 'artist_name', 'track_name', 'track_id', 'album_name', 'album_id',
                'release_date', 'duration_ms', 'popularity', 'explicit',
                'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness',
                'valence', 'tempo', 'time_signature', 'featured_artists'])

# end time and print
end_time = timer()
elapsed_time = int(end_time - start_time)
print(f"Elapsed time: {format_time(elapsed_time)}")

Elapsed time: 1 minutes, 2 seconds


In [15]:
ft.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,duration_ms,popularity,explicit,danceability,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists
0,Lil Durk,All My Life (Burna Boy Remix),6vyM1yoPhhdezAcW8CmCJq,All My Life (Remixes) (feat. Burna Boy),11OlFW8YpwZTlhAdc0NCV7,2023-10-13,264179,51,True,0.787,...,-5.892,1,0.226,0.071,0.0,0.139,0.698,142.945,4,"[[Burna Boy, J. Cole]]"
1,Burna Boy,Talibans II,5zq5fYXIE7X842DD5HBiiB,Talibans II,4EouhaKTJkW8xRpLsCz7va,2023-07-20,176453,54,True,0.914,...,-6.77,0,0.31,0.227,3e-06,0.132,0.54,102.937,4,[[Byron Messia]]
2,TitoM,Tshwala Bam (feat. S.N.E),6cyXHTix4NQ069gKJEYv41,Tshwala Bam (feat. S.N.E),1ZyNY4zHrl8MMzZ5li2Pm2,2024-05-13,203571,67,False,0.813,...,-12.066,0,0.0426,0.0639,0.00667,0.0951,0.517,112.022,4,"[[Yuppe, Burna Boy, S.N.E]]"
3,J Hus,Masculine (feat. Burna Boy),4M6yrN4rJUrwNgHzGKfrN1,Beautiful And Brutal Yard,766bxryPZBL0hjz0KM6VUD,2023-07-14,204880,54,True,0.675,...,-5.977,1,0.291,0.179,2e-05,0.127,0.713,66.98,5,[[Burna Boy]]
4,21 Savage,just like me,5KI7I4mEtulXcv5VQJaV35,american dream,2RRYaYHY7fIIdvFlvgb5vq,2024-01-12,231338,60,True,0.701,...,-4.695,1,0.105,0.479,2.8e-05,0.306,0.505,82.984,4,"[[Burna Boy, Metro Boomin]]"


In [16]:
# creating a new column to categorize songs from this playlist to Features
ft['type']='Feature' 

In [18]:
# joining the df_burna and ft dataframes
df = pd.concat([df_burna, ft], ignore_index= True, sort= False)

In [19]:
# Eliminating rows that share the same track name 

# The main track has higher popularity than duplicate tracks that share the same name hence it was an elimination by popularity

df = df.sort_values(by= ['track_name', 'popularity'], ascending= [True, False])
df.drop_duplicates(subset= 'track_name', keep= 'first', inplace= True, ignore_index= True)

In [26]:
# check the records that wasn't assigned any song type ie. single, album, feature, ep, compilation
notype = df['type'].isna()
flush = df.loc[notype].sort_values(by= 'popularity', ascending= False) 
flush.head()

Unnamed: 0,artist_name,track_name,track_id,album_name,album_id,release_date,duration_ms,popularity,explicit,danceability,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,featured_artists,type


In [27]:
# flush them
index_flush = flush.index
df = df.drop(index_flush)

In [28]:
# modifying the 'mode' column by replacing 0 with Minor and 1 with Major
df['mode'].replace({0:'Minor', 1:'Major'}, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['mode'].replace({0:'Minor', 1:'Major'}, inplace= True)


In [29]:
# modifying the duration_ms column to appear as minutes and seconds 

df['duration_ms'] = pd.to_timedelta(df['duration_ms'], unit='ms')

# extract the minutes and seconds components of the duration as strings
df['duration'] = df['duration_ms'].dt.components['minutes'].astype(str).str.zfill(2) + ':' + df['duration_ms'].dt.components['seconds'].astype(str).str.zfill(2)
df.drop(columns=['duration_ms'], inplace= True)
df[['track_name','duration']].head()

Unnamed: 0,track_name,duration
0,#Yawadey,03:57
1,12 Jewels (feat. RZA),00:27
2,20 10 20,03:33
3,23,04:05
4,34,02:40


In [30]:
# creating three new fields: year, month and day a song was released from release date column
df['release_date'] = pd.to_datetime(df['release_date'])
df['year'] = df['release_date'].dt.year
df['month'] = df['release_date'].dt.month_name()
df['day_of_the_week'] = df['release_date'].dt.day_name()

df[['track_name', 'year', 'month', 'day_of_the_week']].head()

Unnamed: 0,track_name,year,month,day_of_the_week
0,#Yawadey,2013,August,Monday
1,12 Jewels (feat. RZA),2023,August,Thursday
2,20 10 20,2020,October,Thursday
3,23,2020,August,Thursday
4,34,2019,March,Thursday


In [31]:
# Time to remove the square brackets from the featured artist column 

# They have square brackets in the first place because it's a group of smaller lists in a larger list that was embbedded in the DataFrame

# We convert the data type to string no matter what, then remove the square brackets and apostrophe around it 

df['featured_artists'] = df['featured_artists'].astype(str)
df['featured_artists'] = df['featured_artists'].replace({"'": '', '"': '', r'\[':'', r'\]':''}, regex=True)
df[['track_name', 'featured_artists']].head()

Unnamed: 0,track_name,featured_artists
0,#Yawadey,
1,12 Jewels (feat. RZA),RZA
2,20 10 20,
3,23,
4,34,DJDS


In [32]:
key_dict = {
    0: "C",
    1: "C#/Db",
    2: "D",
    3: "D#/Eb",
    4: "E",
    5: "F",
    6: "F#/Gb",
    7: "G",
    8: "G#/Ab",
    9: "A",
    10: "A#/Bb",
    11: "B",
    -1: "NaN"
}

df['key'] = df['key'].map(key_dict).fillna("NaN")
df[['track_name', 'key']].head()

Unnamed: 0,track_name,key
0,#Yawadey,F#/Gb
1,12 Jewels (feat. RZA),F#/Gb
2,20 10 20,F#/Gb
3,23,F
4,34,F#/Gb


In [33]:
df.to_csv('Burna Boy Discography.csv', index= False)